Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 3
-rw-r--r-- fs/Makefile | 2
-rw-r--r-- fs/autofs4/autofs_i.h | 72
-rw-r--r-- fs/autofs4/dev-ioctl.c | 57
-rw-r--r-- fs/autofs4/expire.c | 84
-rw-r--r-- fs/autofs4/init.c | 10
-rw-r--r-- fs/autofs4/inode.c | 52
-rw-r--r-- fs/autofs4/root.c | 165
-rw-r--r-- fs/autofs4/symlink.c | 11
-rw-r--r-- fs/autofs4/waitq.c | 78
-rw-r--r-- fs/block_dev.c | 6
-rw-r--r-- fs/btrfs/backref.c | 12
-rw-r--r-- fs/btrfs/check-integrity.c | 12
-rw-r--r-- fs/btrfs/compression.h | 9
-rw-r--r-- fs/btrfs/ctree.c | 36
-rw-r--r-- fs/btrfs/ctree.h | 87
-rw-r--r-- fs/btrfs/delayed-inode.c | 10
-rw-r--r-- fs/btrfs/delayed-ref.c | 12
-rw-r--r-- fs/btrfs/dev-replace.c | 134
-rw-r--r-- fs/btrfs/dev-replace.h | 7
-rw-r--r-- fs/btrfs/disk-io.c | 73
-rw-r--r-- fs/btrfs/extent-tree.c | 40
-rw-r--r-- fs/btrfs/extent_io.c | 40
-rw-r--r-- fs/btrfs/extent_io.h | 5
-rw-r--r-- fs/btrfs/extent_map.c | 8
-rw-r--r-- fs/btrfs/file-item.c | 103
-rw-r--r-- fs/btrfs/file.c | 158
-rw-r--r-- fs/btrfs/inode-map.c | 3
-rw-r--r-- fs/btrfs/inode.c | 326
-rw-r--r-- fs/btrfs/ioctl.c | 35
-rw-r--r-- fs/btrfs/ordered-data.c | 6
-rw-r--r-- fs/btrfs/print-tree.c | 23
-rw-r--r-- fs/btrfs/props.c | 1
-rw-r--r-- fs/btrfs/reada.c | 268
-rw-r--r-- fs/btrfs/root-tree.c | 2
-rw-r--r-- fs/btrfs/scrub.c | 32
-rw-r--r-- fs/btrfs/send.c | 37
-rw-r--r-- fs/btrfs/super.c | 52
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c | 9
-rw-r--r-- fs/btrfs/tests/free-space-tree-tests.c | 1
-rw-r--r-- fs/btrfs/tests/inode-tests.c | 1
-rw-r--r-- fs/btrfs/transaction.c | 13
-rw-r--r-- fs/btrfs/tree-log.c | 102
-rw-r--r-- fs/btrfs/tree-log.h | 2
-rw-r--r-- fs/btrfs/volumes.c | 51
-rw-r--r-- fs/btrfs/xattr.c | 67
-rw-r--r-- fs/buffer.c | 24
-rw-r--r-- fs/cachefiles/daemon.c | 13
-rw-r--r-- fs/cachefiles/interface.c | 11
-rw-r--r-- fs/cachefiles/internal.h | 4
-rw-r--r-- fs/cachefiles/namei.c | 28
-rw-r--r-- fs/ceph/addr.c | 324
-rw-r--r-- fs/ceph/caps.c | 11
-rw-r--r-- fs/ceph/dir.c | 69
-rw-r--r-- fs/ceph/export.c | 13
-rw-r--r-- fs/ceph/file.c | 15
-rw-r--r-- fs/ceph/inode.c | 53
-rw-r--r-- fs/ceph/mds_client.c | 7
-rw-r--r-- fs/ceph/snap.c | 16
-rw-r--r-- fs/ceph/super.c | 47
-rw-r--r-- fs/ceph/super.h | 23
-rw-r--r-- fs/ceph/xattr.c | 78
-rw-r--r-- fs/cifs/cifs_debug.c | 56
-rw-r--r-- fs/cifs/cifs_debug.h | 2
-rw-r--r-- fs/cifs/cifsencrypt.c | 32
-rw-r--r-- fs/cifs/cifsfs.c | 10
-rw-r--r-- fs/cifs/cifsglob.h | 4
-rw-r--r-- fs/cifs/smbencrypt.c | 26
-rw-r--r-- fs/compat_ioctl.c | 22
-rw-r--r-- fs/configfs/dir.c | 53
-rw-r--r-- fs/configfs/inode.c | 20
-rw-r--r-- fs/configfs/item.c | 1
-rw-r--r-- fs/coredump.c | 30
-rw-r--r-- fs/crypto/Kconfig | 18
-rw-r--r-- fs/crypto/Makefile | 3
-rw-r--r-- fs/crypto/crypto.c | 555
-rw-r--r-- fs/crypto/fname.c (renamed from fs/f2fs/crypto_fname.c) | 276
-rw-r--r-- fs/crypto/keyinfo.c | 272
-rw-r--r-- fs/crypto/policy.c | 229
-rw-r--r-- fs/dax.c | 9
-rw-r--r-- fs/dcache.c | 177
-rw-r--r-- fs/direct-io.c | 12
-rw-r--r-- fs/dlm/config.c | 41
-rw-r--r-- fs/dlm/lowcomms.c | 74
-rw-r--r-- fs/ecryptfs/crypto.c | 134
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 13
-rw-r--r-- fs/ecryptfs/inode.c | 12
-rw-r--r-- fs/ecryptfs/keystore.c | 224
-rw-r--r-- fs/ecryptfs/main.c | 1
-rw-r--r-- fs/ecryptfs/mmap.c | 1
-rw-r--r-- fs/ecryptfs/super.c | 1
-rw-r--r-- fs/eventfd.c | 42
-rw-r--r-- fs/eventpoll.c | 2
-rw-r--r-- fs/exec.c | 100
-rw-r--r-- fs/ext2/ext2.h | 3
-rw-r--r-- fs/ext2/super.c | 25
-rw-r--r-- fs/ext2/xattr.c | 139
-rw-r--r-- fs/ext2/xattr.h | 21
-rw-r--r-- fs/ext4/crypto.c | 24
-rw-r--r-- fs/ext4/crypto_fname.c | 32
-rw-r--r-- fs/ext4/crypto_key.c | 42
-rw-r--r-- fs/ext4/dir.c | 2
-rw-r--r-- fs/ext4/ext4.h | 75
-rw-r--r-- fs/ext4/ext4_crypto.h | 2
-rw-r--r-- fs/ext4/ext4_extents.h | 2
-rw-r--r-- fs/ext4/extents.c | 128
-rw-r--r-- fs/ext4/extents_status.c | 4
-rw-r--r-- fs/ext4/file.c | 129
-rw-r--r-- fs/ext4/ialloc.c | 2
-rw-r--r-- fs/ext4/indirect.c | 29
-rw-r--r-- fs/ext4/inline.c | 8
-rw-r--r-- fs/ext4/inode.c | 402
-rw-r--r-- fs/ext4/mballoc.c | 81
-rw-r--r-- fs/ext4/mballoc.h | 12
-rw-r--r-- fs/ext4/migrate.c | 2
-rw-r--r-- fs/ext4/mmp.c | 34
-rw-r--r-- fs/ext4/page-io.c | 14
-rw-r--r-- fs/ext4/super.c | 39
-rw-r--r-- fs/ext4/xattr.c | 166
-rw-r--r-- fs/ext4/xattr.h | 3
-rw-r--r-- fs/f2fs/Kconfig | 12
-rw-r--r-- fs/f2fs/Makefile | 2
-rw-r--r-- fs/f2fs/checkpoint.c | 77
-rw-r--r-- fs/f2fs/crypto.c | 491
-rw-r--r-- fs/f2fs/crypto_key.c | 254
-rw-r--r-- fs/f2fs/crypto_policy.c | 209
-rw-r--r-- fs/f2fs/data.c | 428
-rw-r--r-- fs/f2fs/dir.c | 94
-rw-r--r-- fs/f2fs/extent_cache.c | 176
-rw-r--r-- fs/f2fs/f2fs.h | 315
-rw-r--r-- fs/f2fs/f2fs_crypto.h | 151
-rw-r--r-- fs/f2fs/file.c | 114
-rw-r--r-- fs/f2fs/gc.c | 245
-rw-r--r-- fs/f2fs/inline.c | 43
-rw-r--r-- fs/f2fs/inode.c | 15
-rw-r--r-- fs/f2fs/namei.c | 168
-rw-r--r-- fs/f2fs/node.c | 223
-rw-r--r-- fs/f2fs/node.h | 26
-rw-r--r-- fs/f2fs/recovery.c | 14
-rw-r--r-- fs/f2fs/segment.c | 386
-rw-r--r-- fs/f2fs/segment.h | 5
-rw-r--r-- fs/f2fs/super.c | 204
-rw-r--r-- fs/f2fs/trace.c | 6
-rw-r--r-- fs/f2fs/xattr.c | 6
-rw-r--r-- fs/f2fs/xattr.h | 3
-rw-r--r-- fs/fat/Kconfig | 18
-rw-r--r-- fs/fat/inode.c | 4
-rw-r--r-- fs/fhandle.c | 2
-rw-r--r-- fs/fs-writeback.c | 37
-rw-r--r-- fs/fuse/cuse.c | 4
-rw-r--r-- fs/fuse/file.c | 56
-rw-r--r-- fs/fuse/fuse_i.h | 9
-rw-r--r-- fs/gfs2/aops.c | 2
-rw-r--r-- fs/gfs2/dir.c | 6
-rw-r--r-- fs/gfs2/export.c | 2
-rw-r--r-- fs/gfs2/glock.c | 10
-rw-r--r-- fs/gfs2/incore.h | 1
-rw-r--r-- fs/gfs2/inode.c | 71
-rw-r--r-- fs/gfs2/inode.h | 5
-rw-r--r-- fs/gfs2/ops_fstype.c | 2
-rw-r--r-- fs/gfs2/super.c | 26
-rw-r--r-- fs/jbd2/commit.c | 49
-rw-r--r-- fs/jbd2/journal.c | 43
-rw-r--r-- fs/jbd2/recovery.c | 31
-rw-r--r-- fs/jbd2/revoke.c | 60
-rw-r--r-- fs/jbd2/transaction.c | 22
-rw-r--r-- fs/jffs2/gc.c | 64
-rw-r--r-- fs/jffs2/jffs2_fs_sb.h | 2
-rw-r--r-- fs/jffs2/nodemgmt.c | 4
-rw-r--r-- fs/jffs2/wbuf.c | 6
-rw-r--r-- fs/kernfs/dir.c | 210
-rw-r--r-- fs/kernfs/mount.c | 69
-rw-r--r-- fs/mbcache.c | 1093
-rw-r--r-- fs/mpage.c | 3
-rw-r--r-- fs/namei.c | 311
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 72
-rw-r--r-- fs/nfs/blocklayout/blocklayout.h | 14
-rw-r--r-- fs/nfs/blocklayout/dev.c | 144
-rw-r--r-- fs/nfs/blocklayout/extent_tree.c | 44
-rw-r--r-- fs/nfs/blocklayout/rpc_pipefs.c | 2
-rw-r--r-- fs/nfs/callback.h | 3
-rw-r--r-- fs/nfs/callback_proc.c | 69
-rw-r--r-- fs/nfs/callback_xdr.c | 12
-rw-r--r-- fs/nfs/dir.c | 12
-rw-r--r-- fs/nfs/file.c | 12
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2
-rw-r--r-- fs/nfs/inode.c | 2
-rw-r--r-- fs/nfs/internal.h | 6
-rw-r--r-- fs/nfs/nfs4file.c | 33
-rw-r--r-- fs/nfs/nfs4proc.c | 76
-rw-r--r-- fs/nfs/nfs4session.c | 54
-rw-r--r-- fs/nfs/nfs4session.h | 8
-rw-r--r-- fs/nfs/pnfs_nfs.c | 16
-rw-r--r-- fs/nfsd/Kconfig | 28
-rw-r--r-- fs/nfsd/Makefile | 4
-rw-r--r-- fs/nfsd/blocklayout.c | 298
-rw-r--r-- fs/nfsd/blocklayoutxdr.c | 77
-rw-r--r-- fs/nfsd/blocklayoutxdr.h | 14
-rw-r--r-- fs/nfsd/nfs3proc.c | 7
-rw-r--r-- fs/nfsd/nfs4layouts.c | 31
-rw-r--r-- fs/nfsd/nfs4proc.c | 9
-rw-r--r-- fs/nfsd/nfs4recover.c | 29
-rw-r--r-- fs/nfsd/nfs4state.c | 29
-rw-r--r-- fs/nfsd/nfs4xdr.c | 26
-rw-r--r-- fs/nfsd/pnfs.h | 8
-rw-r--r-- fs/nfsd/vfs.c | 4
-rw-r--r-- fs/nfsd/vfs.h | 19
-rw-r--r-- fs/nilfs2/page.c | 2
-rw-r--r-- fs/ocfs2/Makefile | 3
-rw-r--r-- fs/ocfs2/alloc.c | 105
-rw-r--r-- fs/ocfs2/aops.c | 1141
-rw-r--r-- fs/ocfs2/aops.h | 19
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 14
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c | 22
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 26
-rw-r--r-- fs/ocfs2/dlm/dlmconvert.c | 30
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 13
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 127
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 41
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 13
-rw-r--r-- fs/ocfs2/file.c | 165
-rw-r--r-- fs/ocfs2/filecheck.c | 606
-rw-r--r-- fs/ocfs2/filecheck.h | 49
-rw-r--r-- fs/ocfs2/inode.c | 228
-rw-r--r-- fs/ocfs2/inode.h | 9
-rw-r--r-- fs/ocfs2/journal.c | 8
-rw-r--r-- fs/ocfs2/localalloc.c | 4
-rw-r--r-- fs/ocfs2/mmap.c | 4
-rw-r--r-- fs/ocfs2/ocfs2.h | 8
-rw-r--r-- fs/ocfs2/ocfs2_trace.h | 20
-rw-r--r-- fs/ocfs2/quota_global.c | 27
-rw-r--r-- fs/ocfs2/resize.c | 2
-rw-r--r-- fs/ocfs2/stackglue.c | 3
-rw-r--r-- fs/ocfs2/stackglue.h | 2
-rw-r--r-- fs/ocfs2/super.c | 49
-rw-r--r-- fs/ocfs2/super.h | 2
-rw-r--r-- fs/open.c | 6
-rw-r--r-- fs/orangefs/Kconfig | 6
-rw-r--r-- fs/orangefs/Makefile | 10
-rw-r--r-- fs/orangefs/acl.c | 175
-rw-r--r-- fs/orangefs/dcache.c | 138
-rw-r--r-- fs/orangefs/devorangefs-req.c | 943
-rw-r--r-- fs/orangefs/dir.c | 400
-rw-r--r-- fs/orangefs/downcall.h | 133
-rw-r--r-- fs/orangefs/file.c | 717
-rw-r--r-- fs/orangefs/inode.c | 475
-rw-r--r-- fs/orangefs/namei.c | 462
-rw-r--r-- fs/orangefs/orangefs-bufmap.c | 556
-rw-r--r-- fs/orangefs/orangefs-bufmap.h | 36
-rw-r--r-- fs/orangefs/orangefs-cache.c | 161
-rw-r--r-- fs/orangefs/orangefs-debug.h | 92
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 455
-rw-r--r-- fs/orangefs/orangefs-debugfs.h | 3
-rw-r--r-- fs/orangefs/orangefs-dev-proto.h | 62
-rw-r--r-- fs/orangefs/orangefs-kernel.h | 623
-rw-r--r-- fs/orangefs/orangefs-mod.c | 293
-rw-r--r-- fs/orangefs/orangefs-sysfs.c | 1772
-rw-r--r-- fs/orangefs/orangefs-sysfs.h | 2
-rw-r--r-- fs/orangefs/orangefs-utils.c | 1048
-rw-r--r-- fs/orangefs/protocol.h | 452
-rw-r--r-- fs/orangefs/super.c | 559
-rw-r--r-- fs/orangefs/symlink.c | 19
-rw-r--r-- fs/orangefs/upcall.h | 246
-rw-r--r-- fs/orangefs/waitqueue.c | 357
-rw-r--r-- fs/orangefs/xattr.c | 545
-rw-r--r-- fs/overlayfs/copy_up.c | 35
-rw-r--r-- fs/overlayfs/dir.c | 61
-rw-r--r-- fs/overlayfs/overlayfs.h | 1
-rw-r--r-- fs/overlayfs/readdir.c | 53
-rw-r--r-- fs/overlayfs/super.c | 18
-rw-r--r-- fs/proc/base.c | 71
-rw-r--r-- fs/proc/meminfo.c | 31
-rw-r--r-- fs/proc/namespaces.c | 3
-rw-r--r-- fs/proc/page.c | 8
-rw-r--r-- fs/proc/task_mmu.c | 14
-rw-r--r-- fs/proc/vmcore.c | 7
-rw-r--r-- fs/proc_namespace.c | 2
-rw-r--r-- fs/pstore/ram.c | 4
-rw-r--r-- fs/quota/dquot.c | 58
-rw-r--r-- fs/quota/quota.c | 70
-rw-r--r-- fs/quota/quota_tree.c | 67
-rw-r--r-- fs/quota/quota_v2.c | 6
-rw-r--r-- fs/read_write.c | 197
-rw-r--r-- fs/reiserfs/super.c | 1
-rw-r--r-- fs/select.c | 8
-rw-r--r-- fs/splice.c | 5
-rw-r--r-- fs/ubifs/Makefile | 1
-rw-r--r-- fs/ubifs/misc.c | 57
-rw-r--r-- fs/ubifs/ubifs.h | 41
-rw-r--r-- fs/ubifs/xattr.c | 1
-rw-r--r-- fs/udf/dir.c | 13
-rw-r--r-- fs/udf/namei.c | 29
-rw-r--r-- fs/udf/super.c | 38
-rw-r--r-- fs/udf/udfdecl.h | 21
-rw-r--r-- fs/udf/unicode.c | 630
-rw-r--r-- fs/xfs/Makefile | 3
-rw-r--r-- fs/xfs/libxfs/xfs_alloc_btree.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_attr_sf.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 172
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.c | 5
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 32
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.h | 16
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_node.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 4
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 170
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.h | 38
-rw-r--r-- fs/xfs/libxfs/xfs_inode_fork.c | 3
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h | 19
-rw-r--r-- fs/xfs/libxfs/xfs_quota_defs.h | 3
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.c | 32
-rw-r--r-- fs/xfs/libxfs/xfs_sb.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_shared.h | 1
-rw-r--r-- fs/xfs/xfs_aops.c | 1027
-rw-r--r-- fs/xfs/xfs_aops.h | 4
-rw-r--r-- fs/xfs/xfs_attr_list.c | 19
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 8
-rw-r--r-- fs/xfs/xfs_buf.c | 2
-rw-r--r-- fs/xfs/xfs_buf.h | 26
-rw-r--r-- fs/xfs/xfs_buf_item.c | 10
-rw-r--r-- fs/xfs/xfs_dir2_readdir.c | 2
-rw-r--r-- fs/xfs/xfs_discard.c | 2
-rw-r--r-- fs/xfs/xfs_dquot.c | 129
-rw-r--r-- fs/xfs/xfs_export.c | 4
-rw-r--r-- fs/xfs/xfs_file.c | 88
-rw-r--r-- fs/xfs/xfs_filestream.c | 4
-rw-r--r-- fs/xfs/xfs_fsops.h | 1
-rw-r--r-- fs/xfs/xfs_icache.c | 43
-rw-r--r-- fs/xfs/xfs_inode.c | 174
-rw-r--r-- fs/xfs/xfs_inode.h | 10
-rw-r--r-- fs/xfs/xfs_inode_item.c | 82
-rw-r--r-- fs/xfs/xfs_ioctl.c | 121
-rw-r--r-- fs/xfs/xfs_iops.c | 59
-rw-r--r-- fs/xfs/xfs_itable.c | 22
-rw-r--r-- fs/xfs/xfs_log.c | 152
-rw-r--r-- fs/xfs/xfs_log_recover.c | 97
-rw-r--r-- fs/xfs/xfs_mount.c | 24
-rw-r--r-- fs/xfs/xfs_mount.h | 31
-rw-r--r-- fs/xfs/xfs_ondisk.h | 117
-rw-r--r-- fs/xfs/xfs_pnfs.h | 2
-rw-r--r-- fs/xfs/xfs_qm.c | 55
-rw-r--r-- fs/xfs/xfs_qm.h | 48
-rw-r--r-- fs/xfs/xfs_qm_syscalls.c | 27
-rw-r--r-- fs/xfs/xfs_quotaops.c | 36
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 2
-rw-r--r-- fs/xfs/xfs_super.c | 528
-rw-r--r-- fs/xfs/xfs_super.h | 4
-rw-r--r-- fs/xfs/xfs_sysfs.c | 78
-rw-r--r-- fs/xfs/xfs_trace.h | 9
-rw-r--r-- fs/xfs/xfs_trans.c | 4
-rw-r--r-- fs/xfs/xfs_trans.h | 1
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 5
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 10
-rw-r--r-- fs/xfs/xfs_trans_dquot.c | 15
-rw-r--r-- fs/xfs/xfs_trans_inode.c | 14
356 files changed, 23417 insertions, 9241 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9adee0d7536e..6725f59c18e6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -84,6 +84,8 @@ config MANDATORY_FILE_LOCKING
 
 	  To the best of my knowledge this is dead code that no one cares about.
 
+source "fs/crypto/Kconfig"
+
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -207,6 +209,7 @@ menuconfig MISC_FILESYSTEMS
 
 if MISC_FILESYSTEMS
 
+source "fs/orangefs/Kconfig"
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 79f522575cba..85b6e13b62d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
30obj-$(CONFIG_USERFAULTFD) += userfaultfd.o 30obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
31obj-$(CONFIG_AIO) += aio.o 31obj-$(CONFIG_AIO) += aio.o
32obj-$(CONFIG_FS_DAX) += dax.o 32obj-$(CONFIG_FS_DAX) += dax.o
33obj-$(CONFIG_FS_ENCRYPTION) += crypto/
33obj-$(CONFIG_FILE_LOCKING) += locks.o 34obj-$(CONFIG_FILE_LOCKING) += locks.o
34obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 35obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
35obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 36obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -105,6 +106,7 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4/
105obj-$(CONFIG_ADFS_FS) += adfs/ 106obj-$(CONFIG_ADFS_FS) += adfs/
106obj-$(CONFIG_FUSE_FS) += fuse/ 107obj-$(CONFIG_FUSE_FS) += fuse/
107obj-$(CONFIG_OVERLAY_FS) += overlayfs/ 108obj-$(CONFIG_OVERLAY_FS) += overlayfs/
109obj-$(CONFIG_ORANGEFS_FS) += orangefs/
108obj-$(CONFIG_UDF_FS) += udf/ 110obj-$(CONFIG_UDF_FS) += udf/
109obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ 111obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
110obj-$(CONFIG_OMFS_FS) += omfs/ 112obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
+/*
  * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
  * Copyright 2005-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
 
 /* Internal header file for autofs */
 
@@ -35,28 +31,23 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /* #define DEBUG */
 
-#define DPRINTK(fmt, ...)				\
-	pr_debug("pid %d: %s: " fmt "\n",		\
-		current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...)				\
-	printk(KERN_WARNING "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...)				\
-	printk(KERN_ERR "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
-   inode structures. Each file in the filesystem has an instance of this
-   structure. It holds a reference to the dentry, so dentries are never
-   flushed while the file exists. All name lookups are dealt with at the
-   dentry level, although the filesystem can interfere in the validation
-   process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
 struct autofs_info {
 	struct dentry *dentry;
 	struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
 	kgid_t gid;
 };
 
-#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
 #define AUTOFS_INF_NO_RCU	(1<<1) /* the dentry is being considered
 					* for expiry, so RCU_walk is
 					* not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
 }
 
 /* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
 	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
 int is_autofs4_dentry(struct dentry *);
 int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
-			struct autofs_sb_info *,
-			struct autofs_packet_expire __user *);
+		       struct autofs_sb_info *,
+		       struct autofs_packet_expire __user *);
 int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int when);
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
-			struct autofs_sb_info *, int __user *);
+			 struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
 				     struct vfsmount *mnt,
 				     struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
 
 /* Queue management functions */
 
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		if (list_empty(&ino->expiring))
 			list_add(&ino->expiring, &sbi->expiring_list);
 	}
-	return;
 }
 
 static inline void autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (list_empty(&ino->expiring))
 			list_add(&ino->expiring, &sbi->expiring_list);
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static inline void autofs4_del_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (!list_empty(&ino->expiring))
 			list_del_init(&ino->expiring);
		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
 {
 	int err = 0;
 
-	if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
-	    (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
-		AUTOFS_WARN("ioctl control interface version mismatch: "
-			"kernel(%u.%u), user(%u.%u), cmd(%d)",
+	if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+	    (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+		pr_warn("ioctl control interface version mismatch: "
+			"kernel(%u.%u), user(%u.%u), cmd(%d)\n",
 			AUTOFS_DEV_IOCTL_VERSION_MAJOR,
 			AUTOFS_DEV_IOCTL_VERSION_MINOR,
 			param->ver_major, param->ver_minor, cmd);
 		err = -EINVAL;
 	}
 
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  * Copy parameter control struct, including a possible path allocated
  * at the end of the struct.
  */
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+		copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
 	struct autofs_dev_ioctl tmp, *res;
 
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 {
 	kfree(param);
-	return;
 }
 
 /*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 
 	err = check_dev_ioctl_version(cmd, param);
 	if (err) {
-		AUTOFS_WARN("invalid device control module version "
-			"supplied for cmd(0x%08x)", cmd);
+		pr_warn("invalid device control module version "
+			"supplied for cmd(0x%08x)\n", cmd);
 		goto out;
 	}
 
 	if (param->size > sizeof(*param)) {
 		err = invalid_str(param->path, param->size - sizeof(*param));
 		if (err) {
-			AUTOFS_WARN(
-			  "path string terminator missing for cmd(0x%08x)",
+			pr_warn(
+			  "path string terminator missing for cmd(0x%08x)\n",
 			  cmd);
 			goto out;
 		}
 
 		err = check_name(param->path);
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+			pr_warn("invalid path supplied for cmd(0x%08x)\n",
 				cmd);
 			goto out;
 		}
 	}
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
 			     void *data)
 {
 	struct path path;
-	int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+	int err;
+
+	err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
 	if (err)
 		return err;
 	err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
 static int test_by_type(struct path *path, void *p)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
 	return ino && ino->sbi->type & *(unsigned *)p;
 }
 
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	new_pid = get_task_pid(current, PIDTYPE_PGID);
 
 	if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
-		AUTOFS_WARN("Not allowed to change PID namespace");
+		pr_warn("not allowed to change PID namespace\n");
 		err = -EINVAL;
 		goto out;
 	}
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(path.dentry, 0);
 		spin_lock(&sbi->fs_lock);
-		param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
-		param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+		param->requester.uid =
+			from_kuid_munged(current_user_ns(), ino->uid);
+		param->requester.gid =
+			from_kgid_munged(current_user_ns(), ino->gid);
 		spin_unlock(&sbi->fs_lock);
 	}
 	path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
 }
 
 /* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+			     struct autofs_dev_ioctl __user *user)
 {
 	struct autofs_dev_ioctl *param;
 	struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
 
 	fn = lookup_dev_ioctl(cmd);
 	if (!fn) {
-		AUTOFS_WARN("unknown command 0x%08x", command);
+		pr_warn("unknown command 0x%08x\n", command);
 		return -ENOTTY;
 	}
 
@@ -711,6 +717,7 @@ out:
 static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
 {
 	int err;
+
 	err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
 	return (long) err;
 }
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
 	.minor		= AUTOFS_MINOR,
-	.name		= AUTOFS_DEVICE_NAME,
-	.fops		= &_dev_ioctl_fops
+	.name		= AUTOFS_DEVICE_NAME,
+	.fops		= &_dev_ioctl_fops
 };
 
 MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
 
 	r = misc_register(&_autofs_dev_ioctl_misc);
 	if (r) {
-		AUTOFS_ERROR("misc_register failed for control device");
+		pr_err("misc_register failed for control device\n");
 		return r;
 	}
 
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
 void autofs_dev_ioctl_exit(void)
 {
 	misc_deregister(&_autofs_dev_ioctl_misc);
-	return;
 }
-
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include "autofs_i.h"
 
@@ -18,7 +14,7 @@ static unsigned long now;
 
 /* Check if a dentry can be expired */
 static inline int autofs4_can_expire(struct dentry *dentry,
-				     unsigned long timeout, int do_now)
+				     unsigned long timeout, int do_now)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	struct path path = {.mnt = mnt, .dentry = dentry};
 	int status = 1;
 
-	DPRINTK("dentry %p %pd", dentry, dentry);
+	pr_debug("dentry %p %pd\n", dentry, dentry);
 
 	path_get(&path);
 
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(path.mnt)) {
-		struct autofs_info *ino = autofs4_dentry_ino(top);
+		struct autofs_info *ino;
+
+		ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
 	}
 
 	status = 0;
 done:
-	DPRINTK("returning = %d", status);
+	pr_debug("returning = %d\n", status);
 	path_put(&path);
 	return status;
 }
@@ -74,7 +72,7 @@ done:
  * Calculate and dget next entry in the subdirs list under root.
 */
 static struct dentry *get_next_positive_subdir(struct dentry *prev,
-						struct dentry *root)
+					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
@@ -121,7 +119,7 @@ cont:
  * Calculate and dget next entry in top down tree traversal.
 */
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
-						struct dentry *root)
+					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
@@ -187,15 +185,17 @@ again:
 * autofs submounts.
 */
 static int autofs4_direct_busy(struct vfsmount *mnt,
-				struct dentry *top,
-				unsigned long timeout,
-				int do_now)
+			       struct dentry *top,
+			       unsigned long timeout,
+			       int do_now)
 {
-	DPRINTK("top %p %pd", top, top);
+	pr_debug("top %p %pd\n", top, top);
 
 	/* If it's busy update the expiry counters */
 	if (!may_umount_tree(mnt)) {
-		struct autofs_info *ino = autofs4_dentry_ino(top);
+		struct autofs_info *ino;
+
+		ino = autofs4_dentry_ino(top);
 		if (ino)
 			ino->last_used = jiffies;
 		return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
 	return 0;
 }
 
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
 * The tree is not busy iff no mountpoints are busy
 */
 static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 	struct autofs_info *top_ino = autofs4_dentry_ino(top);
 	struct dentry *p;
 
-	DPRINTK("top %p %pd", top, top);
+	pr_debug("top %p %pd\n", top, top);
 
 	/* Negative dentry - give up */
 	if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 
 	p = NULL;
 	while ((p = get_next_positive_dentry(p, top))) {
-		DPRINTK("dentry %p %pd", p, p);
+		pr_debug("dentry %p %pd\n", p, p);
 
 		/*
 		 * Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 {
 	struct dentry *p;
 
-	DPRINTK("parent %p %pd", parent, parent);
+	pr_debug("parent %p %pd\n", parent, parent);
 
 	p = NULL;
 	while ((p = get_next_positive_dentry(p, parent))) {
-		DPRINTK("dentry %p %pd", p, p);
+		pr_debug("dentry %p %pd\n", p, p);
 
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	 * offset (autofs-5.0+).
 	 */
 	if (d_mountpoint(dentry)) {
-		DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+		pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
 
 		/* Can we umount this guy */
 		if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	}
 
 	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
-		DPRINTK("checking symlink %p %pd", dentry, dentry);
+		pr_debug("checking symlink %p %pd\n", dentry, dentry);
 		/*
 		 * A symlink can't be "busy" in the usual sense so
 		 * just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	} else {
 		/* Path walk currently on this dentry? */
 		struct dentry *expired;
+
 		ino_count = atomic_read(&ino->count) + 1;
 		if (d_count(dentry) > ino_count)
 			return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	return NULL;
 
 found:
-	DPRINTK("returning %p %pd", expired, expired);
+	pr_debug("returning %p %pd\n", expired, expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	smp_mb();
 	ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	if (ino->flags & AUTOFS_INF_EXPIRING) {
 		spin_unlock(&sbi->fs_lock);
 
-		DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+		pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
 
 		status = autofs4_wait(sbi, dentry, NFY_NONE);
 		wait_for_completion(&ino->expire_complete);
 
-		DPRINTK("expire done status=%d", status);
+		pr_debug("expire done status=%d\n", status);
 
 		if (d_unhashed(dentry))
 			return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 
 /* Perform an expiry operation */
 int autofs4_expire_run(struct super_block *sb,
-			struct vfsmount *mnt,
-			struct autofs_sb_info *sbi,
-			struct autofs_packet_expire __user *pkt_p)
+		       struct vfsmount *mnt,
+		       struct autofs_sb_info *sbi,
+		       struct autofs_packet_expire __user *pkt_p)
 {
 	struct autofs_packet_expire pkt;
 	struct autofs_info *ino;
 	struct dentry *dentry;
 	int ret = 0;
 
-	memset(&pkt,0,sizeof pkt);
+	memset(&pkt, 0, sizeof(pkt));
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = autofs_ptype_expire;
 
-	if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+	dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+	if (!dentry)
 		return -EAGAIN;
 
 	pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
 	pkt.name[pkt.len] = '\0';
 	dput(dentry);
 
-	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		ret = -EFAULT;
 
 	spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
 	/* This is synchronous because it makes the daemon a
-	   little easier */
+	 * little easier
+	 */
 	ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 	spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
 int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			struct autofs_sb_info *sbi, int __user *arg)
 {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -24,7 +20,9 @@
 
 struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	struct autofs_info *ino;
+
+	ino = kzalloc(sizeof(*ino), GFP_KERNEL);
 	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
 		INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
 		put_pid(sbi->oz_pgrp);
 	}
 
-	DPRINTK("shutting down");
+	pr_debug("shutting down\n");
 	kill_litter_super(sb);
 	if (sbi)
 		kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	if (sbi->pipe)
+		seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+	else
+		seq_printf(m, ",pipe_ino=-1");
+#endif
 	return 0;
 }
 
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
+
 		if (!*p)
 			continue;
 
@@ -204,9 +208,9 @@
 
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct inode * root_inode;
-	struct dentry * root;
-	struct file * pipe;
+	struct inode *root_inode;
+	struct dentry *root;
+	struct file *pipe;
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
@@ -217,7 +221,7 @@
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	DPRINTK("starting up, sbi = %p",sbi);
+	pr_debug("starting up, sbi = %p\n", sbi);
 
 	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
 			  &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
 			  &sbi->max_proto)) {
-		printk("autofs: called with bogus options\n");
+		pr_err("called with bogus options\n");
 		goto fail_dput;
 	}
 
 	if (pgrp_set) {
 		sbi->oz_pgrp = find_get_pid(pgrp);
 		if (!sbi->oz_pgrp) {
-			pr_warn("autofs: could not find process group %d\n",
+			pr_err("could not find process group %d\n",
 				pgrp);
 			goto fail_dput;
 		}
@@ -290,10 +294,10 @@
 	/* Couldn't this be tested earlier? */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
 	    sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
-		printk("autofs: kernel does not match daemon version "
+		pr_err("kernel does not match daemon version "
 		       "daemon (%d, %d) kernel (%d, %d)\n",
 		       sbi->min_proto, sbi->max_proto,
 		       AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
 		goto fail_dput;
 	}
 
@@ -304,11 +308,11 @@
 	sbi->version = sbi->max_proto;
 	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
 
-	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+	pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
 	pipe = fget(pipefd);
 
 	if (!pipe) {
-		printk("autofs: could not open pipe file descriptor\n");
+		pr_err("could not open pipe file descriptor\n");
 		goto fail_dput;
 	}
 	ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@
 	 */
 	s->s_root = root;
 	return 0;
 
 	/*
 	 * Failure ... clean up.
 	 */
fail_fput:
-	printk("autofs: pipe file descriptor does not contain proper ops\n");
+	pr_err("pipe file descriptor does not contain proper ops\n");
 	fput(pipe);
 	/* fall through */
fail_dput:
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..7ab923940d18 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/capability.h>
 #include <linux/errno.h>
@@ -23,16 +19,18 @@
 
 #include "autofs_i.h"
 
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
 #ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+				      unsigned int, unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+				     struct dentry *, unsigned int);
 static struct vfsmount *autofs4_d_automount(struct path *);
 static int autofs4_d_manage(struct dentry *, bool);
 static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
 static void autofs4_add_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *ino;
+
+	ino = autofs4_dentry_ino(dentry);
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
 		ino->active_count++;
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static void autofs4_del_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *ino;
+
+	ino = autofs4_dentry_ino(dentry);
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
 		}
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
-	DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+	pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
 
 	if (autofs4_oz_mode(sbi))
 		goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
 	struct autofs_info *ino = autofs4_dentry_ino(de);
 	struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
 
-	DPRINTK("releasing %p", de);
+	pr_debug("releasing %p\n", de);
 
 	if (!ino)
 		return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		if (rcu_walk)
 			return -ECHILD;
-		DPRINTK("waiting for mount name=%pd", dentry);
+		pr_debug("waiting for mount name=%pd\n", dentry);
 		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-		DPRINTK("mount wait done status=%d", status);
+		pr_debug("mount wait done status=%d\n", status);
 	}
 	ino->last_used = jiffies;
 	return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
 		struct autofs_info *ino;
-		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		struct dentry *new;
+
+		new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
 		ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
 
-	DPRINTK("dentry=%p %pd", dentry, dentry);
+	pr_debug("dentry=%p %pd\n", dentry, dentry);
 
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
 
-	DPRINTK("dentry=%p %pd", dentry, dentry);
+	pr_debug("dentry=%p %pd\n", dentry, dentry);
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@
 		 * a mount-trap.
 		 */
 		struct inode *inode;
+
 		if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
 			return 0;
 		if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 }
 
 /* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+				     struct dentry *dentry, unsigned int flags)
 {
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
 	struct dentry *active;
 
-	DPRINTK("name = %pd", dentry);
+	pr_debug("name = %pd\n", dentry);
 
 	/* File name too long to exist */
 	if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 
 	sbi = autofs4_sbi(dir->i_sb);
 
-	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
-		current->pid, task_pgrp_nr(current), sbi->catatonic,
-		autofs4_oz_mode(sbi));
+	pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+		 current->pid, task_pgrp_nr(current), sbi->catatonic,
+		 autofs4_oz_mode(sbi));
 
 	active = autofs4_lookup_active(dentry);
-	if (active) {
+	if (active)
 		return active;
-	} else {
+	else {
 		/*
 		 * A dentry that is not within the root can never trigger a
 		 * mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 			return ERR_PTR(-ENOENT);
 
 		/* Mark entries in the root as mount triggers */
-		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+		if (IS_ROOT(dentry->d_parent) &&
+		    autofs_type_indirect(sbi->type))
 			__managed_dentry_set_managed(dentry);
 
 		ino = autofs4_new_ino(sbi);
@@ -537,8 +542,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 		ino->dentry = dentry;
 
 		autofs4_add_active(dentry);
-
-		d_instantiate(dentry, NULL);
 	}
 	return NULL;
 }
@@ -554,7 +557,7 @@ static int autofs4_dir_symlink(struct inode *dir,
554 size_t size = strlen(symname); 557 size_t size = strlen(symname);
555 char *cp; 558 char *cp;
556 559
557 DPRINTK("%s <- %pd", symname, dentry); 560 pr_debug("%s <- %pd\n", symname, dentry);
558 561
559 if (!autofs4_oz_mode(sbi)) 562 if (!autofs4_oz_mode(sbi))
560 return -EACCES; 563 return -EACCES;
@@ -613,7 +616,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
613 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 616 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
614 struct autofs_info *ino = autofs4_dentry_ino(dentry); 617 struct autofs_info *ino = autofs4_dentry_ino(dentry);
615 struct autofs_info *p_ino; 618 struct autofs_info *p_ino;
616 619
617 /* This allows root to remove symlinks */ 620 /* This allows root to remove symlinks */
618 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 621 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
619 return -EPERM; 622 return -EPERM;
@@ -664,7 +667,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
664 if (IS_ROOT(parent->d_parent)) 667 if (IS_ROOT(parent->d_parent))
665 return; 668 return;
666 managed_dentry_clear_managed(parent); 669 managed_dentry_clear_managed(parent);
667 return;
668} 670}
669 671
670static void autofs_clear_leaf_automount_flags(struct dentry *dentry) 672static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +689,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
687 if (d_child->next == &parent->d_subdirs && 689 if (d_child->next == &parent->d_subdirs &&
688 d_child->prev == &parent->d_subdirs) 690 d_child->prev == &parent->d_subdirs)
689 managed_dentry_set_managed(parent); 691 managed_dentry_set_managed(parent);
690 return;
691} 692}
692 693
693static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 694static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +696,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
695 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 696 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
696 struct autofs_info *ino = autofs4_dentry_ino(dentry); 697 struct autofs_info *ino = autofs4_dentry_ino(dentry);
697 struct autofs_info *p_ino; 698 struct autofs_info *p_ino;
698 699
699 DPRINTK("dentry %p, removing %pd", dentry, dentry); 700 pr_debug("dentry %p, removing %pd\n", dentry, dentry);
700 701
701 if (!autofs4_oz_mode(sbi)) 702 if (!autofs4_oz_mode(sbi))
702 return -EACCES; 703 return -EACCES;
@@ -728,7 +729,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
728 return 0; 729 return 0;
729} 730}
730 731
731static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 732static int autofs4_dir_mkdir(struct inode *dir,
733 struct dentry *dentry, umode_t mode)
732{ 734{
733 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 735 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
734 struct autofs_info *ino = autofs4_dentry_ino(dentry); 736 struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +740,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
738 if (!autofs4_oz_mode(sbi)) 740 if (!autofs4_oz_mode(sbi))
739 return -EACCES; 741 return -EACCES;
740 742
741 DPRINTK("dentry %p, creating %pd", dentry, dentry); 743 pr_debug("dentry %p, creating %pd\n", dentry, dentry);
742 744
743 BUG_ON(!ino); 745 BUG_ON(!ino);
744 746
@@ -768,14 +770,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
768/* Get/set timeout ioctl() operation */ 770/* Get/set timeout ioctl() operation */
769#ifdef CONFIG_COMPAT 771#ifdef CONFIG_COMPAT
770static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, 772static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
771 compat_ulong_t __user *p) 773 compat_ulong_t __user *p)
772{ 774{
773 int rv;
774 unsigned long ntimeout; 775 unsigned long ntimeout;
776 int rv;
777
778 rv = get_user(ntimeout, p);
779 if (rv)
780 goto error;
775 781
776 if ((rv = get_user(ntimeout, p)) || 782 rv = put_user(sbi->exp_timeout/HZ, p);
777 (rv = put_user(sbi->exp_timeout/HZ, p))) 783 if (rv)
778 return rv; 784 goto error;
779 785
780 if (ntimeout > UINT_MAX/HZ) 786 if (ntimeout > UINT_MAX/HZ)
781 sbi->exp_timeout = 0; 787 sbi->exp_timeout = 0;
@@ -783,18 +789,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
783 sbi->exp_timeout = ntimeout * HZ; 789 sbi->exp_timeout = ntimeout * HZ;
784 790
785 return 0; 791 return 0;
792error:
793 return rv;
786} 794}
787#endif 795#endif
788 796
789static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, 797static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
790 unsigned long __user *p) 798 unsigned long __user *p)
791{ 799{
792 int rv;
793 unsigned long ntimeout; 800 unsigned long ntimeout;
801 int rv;
802
803 rv = get_user(ntimeout, p);
804 if (rv)
805 goto error;
794 806
795 if ((rv = get_user(ntimeout, p)) || 807 rv = put_user(sbi->exp_timeout/HZ, p);
796 (rv = put_user(sbi->exp_timeout/HZ, p))) 808 if (rv)
797 return rv; 809 goto error;
798 810
799 if (ntimeout > ULONG_MAX/HZ) 811 if (ntimeout > ULONG_MAX/HZ)
800 sbi->exp_timeout = 0; 812 sbi->exp_timeout = 0;
@@ -802,16 +814,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
802 sbi->exp_timeout = ntimeout * HZ; 814 sbi->exp_timeout = ntimeout * HZ;
803 815
804 return 0; 816 return 0;
817error:
818 return rv;
805} 819}
806 820
807/* Return protocol version */ 821/* Return protocol version */
808static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) 822static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
823 int __user *p)
809{ 824{
810 return put_user(sbi->version, p); 825 return put_user(sbi->version, p);
811} 826}
812 827
813/* Return protocol sub version */ 828/* Return protocol sub version */
814static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) 829static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
830 int __user *p)
815{ 831{
816 return put_user(sbi->sub_version, p); 832 return put_user(sbi->sub_version, p);
817} 833}
@@ -826,7 +842,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
826 if (may_umount(mnt)) 842 if (may_umount(mnt))
827 status = 1; 843 status = 1;
828 844
829 DPRINTK("returning %d", status); 845 pr_debug("returning %d\n", status);
830 846
831 status = put_user(status, p); 847 status = put_user(status, p);
832 848
@@ -834,9 +850,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
834} 850}
835 851
836/* Identify autofs4_dentries - this is so we can tell if there's 852/* Identify autofs4_dentries - this is so we can tell if there's
837 an extra dentry refcount or not. We only hold a refcount on the 853 * an extra dentry refcount or not. We only hold a refcount on the
838 dentry if its non-negative (ie, d_inode != NULL) 854 * dentry if its non-negative (ie, d_inode != NULL)
839*/ 855 */
840int is_autofs4_dentry(struct dentry *dentry) 856int is_autofs4_dentry(struct dentry *dentry)
841{ 857{
842 return dentry && d_really_is_positive(dentry) && 858 return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +870,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
854 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 870 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
855 void __user *p = (void __user *)arg; 871 void __user *p = (void __user *)arg;
856 872
857 DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", 873 pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
858 cmd,arg,sbi,task_pgrp_nr(current)); 874 cmd, arg, sbi, task_pgrp_nr(current));
859 875
860 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || 876 if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
861 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) 877 _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
862 return -ENOTTY; 878 return -ENOTTY;
863 879
864 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 880 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
865 return -EPERM; 881 return -EPERM;
866 882
867 switch(cmd) { 883 switch (cmd) {
868 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ 884 case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
869 return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); 885 return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
870 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ 886 case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
871 return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); 887 return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
872 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ 888 case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
873 autofs4_catatonic_mode(sbi); 889 autofs4_catatonic_mode(sbi);
874 return 0; 890 return 0;
@@ -888,13 +904,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
888 904
889 /* return a single thing to expire */ 905 /* return a single thing to expire */
890 case AUTOFS_IOC_EXPIRE: 906 case AUTOFS_IOC_EXPIRE:
891 return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); 907 return autofs4_expire_run(inode->i_sb,
908 filp->f_path.mnt, sbi, p);
892 /* same as above, but can send multiple expires through pipe */ 909 /* same as above, but can send multiple expires through pipe */
893 case AUTOFS_IOC_EXPIRE_MULTI: 910 case AUTOFS_IOC_EXPIRE_MULTI:
894 return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); 911 return autofs4_expire_multi(inode->i_sb,
912 filp->f_path.mnt, sbi, p);
895 913
896 default: 914 default:
897 return -ENOSYS; 915 return -EINVAL;
898 } 916 }
899} 917}
900 918
@@ -902,12 +920,13 @@ static long autofs4_root_ioctl(struct file *filp,
902 unsigned int cmd, unsigned long arg) 920 unsigned int cmd, unsigned long arg)
903{ 921{
904 struct inode *inode = file_inode(filp); 922 struct inode *inode = file_inode(filp);
923
905 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 924 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
906} 925}
907 926
908#ifdef CONFIG_COMPAT 927#ifdef CONFIG_COMPAT
909static long autofs4_root_compat_ioctl(struct file *filp, 928static long autofs4_root_compat_ioctl(struct file *filp,
910 unsigned int cmd, unsigned long arg) 929 unsigned int cmd, unsigned long arg)
911{ 930{
912 struct inode *inode = file_inode(filp); 931 struct inode *inode = file_inode(filp);
913 int ret; 932 int ret;
@@ -916,7 +935,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
916 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 935 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
917 else 936 else
918 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 937 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
919 (unsigned long)compat_ptr(arg)); 938 (unsigned long) compat_ptr(arg));
920 939
921 return ret; 940 return ret;
922} 941}
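
Two patterns repeat throughout the root.c hunks above. DPRINTK() gives way to plain pr_debug(), hence the now-explicit "\n" that the old macro used to append itself, and the chained (rv = get_user(...)) || (rv = put_user(...)) conditions are unrolled into one user copy per statement behind a shared error label. A minimal sketch of the unrolled form, mirroring the timeout helpers (the function name is hypothetical):

static inline int example_get_set_timeout(struct autofs_sb_info *sbi,
					  unsigned long __user *p)
{
	unsigned long ntimeout;
	int rv;

	rv = get_user(ntimeout, p);		/* read the new timeout */
	if (rv)
		goto error;

	rv = put_user(sbi->exp_timeout / HZ, p);	/* hand back the old one */
	if (rv)
		goto error;

	if (ntimeout > ULONG_MAX / HZ)
		sbi->exp_timeout = 0;		/* would overflow: disable expiry */
	else
		sbi->exp_timeout = ntimeout * HZ;

	return 0;
error:
	return rv;
}
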
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/symlink.c
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * 3 *
7 * This file is part of the Linux kernel and is made available under 4 * This file is part of the Linux kernel and is made available under
8 * the terms of the GNU General Public License, version 2, or at your 5 * the terms of the GNU General Public License, version 2, or at your
9 * option, any later version, incorporated herein by reference. 6 * option, any later version, incorporated herein by reference.
10 * 7 */
11 * ------------------------------------------------------------------------- */
12 8
13#include "autofs_i.h" 9#include "autofs_i.h"
14 10
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
18{ 14{
19 struct autofs_sb_info *sbi; 15 struct autofs_sb_info *sbi;
20 struct autofs_info *ino; 16 struct autofs_info *ino;
17
21 if (!dentry) 18 if (!dentry)
22 return ERR_PTR(-ECHILD); 19 return ERR_PTR(-ECHILD);
23 sbi = autofs4_sbi(dentry->d_sb); 20 sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
1/* -*- c -*- --------------------------------------------------------------- * 1/*
2 * 2 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
3 * linux/fs/autofs/waitq.c 3 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
4 *
5 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
6 * Copyright 2001-2006 Ian Kent <raven@themaw.net>
7 * 4 *
8 * This file is part of the Linux kernel and is made available under 5 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 6 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 7 * option, any later version, incorporated herein by reference.
11 * 8 */
12 * ------------------------------------------------------------------------- */
13 9
14#include <linux/slab.h> 10#include <linux/slab.h>
15#include <linux/time.h> 11#include <linux/time.h>
@@ -18,7 +14,8 @@
18#include "autofs_i.h" 14#include "autofs_i.h"
19 15
20/* We make this a static variable rather than a part of the superblock; it 16/* We make this a static variable rather than a part of the superblock; it
21 is better if we don't reassign numbers easily even across filesystems */ 17 * is better if we don't reassign numbers easily even across filesystems
18 */
22static autofs_wqt_t autofs4_next_wait_queue = 1; 19static autofs_wqt_t autofs4_next_wait_queue = 1;
23 20
24/* These are the signals we allow interrupting a pending mount */ 21/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
34 return; 31 return;
35 } 32 }
36 33
37 DPRINTK("entering catatonic mode"); 34 pr_debug("entering catatonic mode\n");
38 35
39 sbi->catatonic = 1; 36 sbi->catatonic = 1;
40 wq = sbi->queues; 37 wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
69 set_fs(KERNEL_DS); 66 set_fs(KERNEL_DS);
70 67
71 mutex_lock(&sbi->pipe_mutex); 68 mutex_lock(&sbi->pipe_mutex);
72 while (bytes && 69 wr = __vfs_write(file, data, bytes, &file->f_pos);
73 (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { 70 while (bytes && wr) {
74 data += wr; 71 data += wr;
75 bytes -= wr; 72 bytes -= wr;
73 wr = __vfs_write(file, data, bytes, &file->f_pos);
76 } 74 }
77 mutex_unlock(&sbi->pipe_mutex); 75 mutex_unlock(&sbi->pipe_mutex);
78 76
79 set_fs(fs); 77 set_fs(fs);
80 78
81 /* Keep the currently executing process from receiving a 79 /* Keep the currently executing process from receiving a
82 SIGPIPE unless it was already supposed to get one */ 80 * SIGPIPE unless it was already supposed to get one
81 */
83 if (wr == -EPIPE && !sigpipe) { 82 if (wr == -EPIPE && !sigpipe) {
84 spin_lock_irqsave(&current->sighand->siglock, flags); 83 spin_lock_irqsave(&current->sighand->siglock, flags);
85 sigdelset(&current->pending.signal, SIGPIPE); 84 sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
89 88
90 return (bytes > 0); 89 return (bytes > 0);
91} 90}
92 91
93static void autofs4_notify_daemon(struct autofs_sb_info *sbi, 92static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
94 struct autofs_wait_queue *wq, 93 struct autofs_wait_queue *wq,
95 int type) 94 int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
102 struct file *pipe = NULL; 101 struct file *pipe = NULL;
103 size_t pktsz; 102 size_t pktsz;
104 103
105 DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", 104 pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
106 (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); 105 (unsigned long) wq->wait_queue_token,
106 wq->name.len, wq->name.name, type);
107 107
108 memset(&pkt,0,sizeof pkt); /* For security reasons */ 108 memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
109 109
110 pkt.hdr.proto_version = sbi->version; 110 pkt.hdr.proto_version = sbi->version;
111 pkt.hdr.type = type; 111 pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
126 } 126 }
127 case autofs_ptype_expire_multi: 127 case autofs_ptype_expire_multi:
128 { 128 {
129 struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; 129 struct autofs_packet_expire_multi *ep =
130 &pkt.v4_pkt.expire_multi;
130 131
131 pktsz = sizeof(*ep); 132 pktsz = sizeof(*ep);
132 133
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 break; 164 break;
164 } 165 }
165 default: 166 default:
166 printk("autofs4_notify_daemon: bad type %d!\n", type); 167 pr_warn("bad type %d!\n", type);
167 mutex_unlock(&sbi->wq_mutex); 168 mutex_unlock(&sbi->wq_mutex);
168 return; 169 return;
169 } 170 }
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
231 if (wq->name.hash == qstr->hash && 232 if (wq->name.hash == qstr->hash &&
232 wq->name.len == qstr->len && 233 wq->name.len == qstr->len &&
233 wq->name.name && 234 wq->name.name &&
234 !memcmp(wq->name.name, qstr->name, qstr->len)) 235 !memcmp(wq->name.name, qstr->name, qstr->len))
235 break; 236 break;
236 } 237 }
237 return wq; 238 return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
248static int validate_request(struct autofs_wait_queue **wait, 249static int validate_request(struct autofs_wait_queue **wait,
249 struct autofs_sb_info *sbi, 250 struct autofs_sb_info *sbi,
250 struct qstr *qstr, 251 struct qstr *qstr,
251 struct dentry*dentry, enum autofs_notify notify) 252 struct dentry *dentry, enum autofs_notify notify)
252{ 253{
253 struct autofs_wait_queue *wq; 254 struct autofs_wait_queue *wq;
254 struct autofs_info *ino; 255 struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
322 * continue on and create a new request. 323 * continue on and create a new request.
323 */ 324 */
324 if (!IS_ROOT(dentry)) { 325 if (!IS_ROOT(dentry)) {
325 if (d_really_is_positive(dentry) && d_unhashed(dentry)) { 326 if (d_unhashed(dentry) &&
327 d_really_is_positive(dentry)) {
326 struct dentry *parent = dentry->d_parent; 328 struct dentry *parent = dentry->d_parent;
329
327 new = d_lookup(parent, &dentry->d_name); 330 new = d_lookup(parent, &dentry->d_name);
328 if (new) 331 if (new)
329 dentry = new; 332 dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
340 return 1; 343 return 1;
341} 344}
342 345
343int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, 346int autofs4_wait(struct autofs_sb_info *sbi,
344 enum autofs_notify notify) 347 struct dentry *dentry, enum autofs_notify notify)
345{ 348{
346 struct autofs_wait_queue *wq; 349 struct autofs_wait_queue *wq;
347 struct qstr qstr; 350 struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
411 414
412 if (!wq) { 415 if (!wq) {
413 /* Create a new wait queue */ 416 /* Create a new wait queue */
414 wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); 417 wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
415 if (!wq) { 418 if (!wq) {
416 kfree(qstr.name); 419 kfree(qstr.name);
417 mutex_unlock(&sbi->wq_mutex); 420 mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
450 autofs_ptype_expire_indirect; 453 autofs_ptype_expire_indirect;
451 } 454 }
452 455
453 DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", 456 pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
454 (unsigned long) wq->wait_queue_token, wq->name.len, 457 (unsigned long) wq->wait_queue_token, wq->name.len,
455 wq->name.name, notify); 458 wq->name.name, notify);
456 459
457 /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ 460 /*
461 * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
462 */
458 autofs4_notify_daemon(sbi, wq, type); 463 autofs4_notify_daemon(sbi, wq, type);
459 } else { 464 } else {
460 wq->wait_ctr++; 465 wq->wait_ctr++;
461 DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", 466 pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
462 (unsigned long) wq->wait_queue_token, wq->name.len, 467 (unsigned long) wq->wait_queue_token, wq->name.len,
463 wq->name.name, notify); 468 wq->name.name, notify);
464 mutex_unlock(&sbi->wq_mutex); 469 mutex_unlock(&sbi->wq_mutex);
465 kfree(qstr.name); 470 kfree(qstr.name);
466 } 471 }
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
471 */ 476 */
472 if (wq->name.name) { 477 if (wq->name.name) {
473 /* Block all but "shutdown" signals while waiting */ 478 /* Block all but "shutdown" signals while waiting */
474 sigset_t oldset; 479 unsigned long shutdown_sigs_mask;
475 unsigned long irqflags; 480 unsigned long irqflags;
481 sigset_t oldset;
476 482
477 spin_lock_irqsave(&current->sighand->siglock, irqflags); 483 spin_lock_irqsave(&current->sighand->siglock, irqflags);
478 oldset = current->blocked; 484 oldset = current->blocked;
479 siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); 485 shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
486 siginitsetinv(&current->blocked, shutdown_sigs_mask);
480 recalc_sigpending(); 487 recalc_sigpending();
481 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 488 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
482 489
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
487 recalc_sigpending(); 494 recalc_sigpending();
488 spin_unlock_irqrestore(&current->sighand->siglock, irqflags); 495 spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
489 } else { 496 } else {
490 DPRINTK("skipped sleeping"); 497 pr_debug("skipped sleeping\n");
491 } 498 }
492 499
493 status = wq->status; 500 status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
562 569
563 return 0; 570 return 0;
564} 571}
565
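
The pipe-write hunk hoists __vfs_write() out of the loop condition. One detail worth flagging: the old condition required wr > 0, while the rewritten loop only tests wr for nonzero, so a negative error return would re-enter the loop. Below is a sketch that keeps the positive-return guard; it assumes the caller has already switched to KERNEL_DS, as autofs4_write() does, and the helper name is hypothetical:

static int example_write_all(struct file *file, const char *data, size_t bytes)
{
	ssize_t wr;

	while (bytes) {
		wr = __vfs_write(file, data, bytes, &file->f_pos);
		if (wr <= 0)		/* error or no progress: stop */
			break;
		data += wr;		/* advance past what was written */
		bytes -= wr;
	}
	return bytes > 0;		/* nonzero means a partial write, as before */
}
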
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 826b164a4b5b..3172c4e2f502 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = {
575static struct dentry *bd_mount(struct file_system_type *fs_type, 575static struct dentry *bd_mount(struct file_system_type *fs_type,
576 int flags, const char *dev_name, void *data) 576 int flags, const char *dev_name, void *data)
577{ 577{
578 return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); 578 struct dentry *dent;
579 dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
580 if (dent)
581 dent->d_sb->s_iflags |= SB_I_CGROUPWB;
582 return dent;
579} 583}
580 584
581static struct file_system_type bd_type = { 585static struct file_system_type bd_type = {
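
bd_mount() no longer returns mount_pseudo() directly so it can first tag the new superblock with SB_I_CGROUPWB, which marks the bdev superblock as supporting cgroup-aware writeback. The same shape for a hypothetical pseudo filesystem; example_sops, EXAMPLEFS_MAGIC and the non-NULL guard simply follow the hunk above, not a documented contract:

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name,
				    void *data)
{
	struct dentry *dent;

	dent = mount_pseudo(fs_type, "example:", &example_sops, NULL,
			    EXAMPLEFS_MAGIC);
	if (dent)	/* guard as in the hunk above */
		dent->d_sb->s_iflags |= SB_I_CGROUPWB;
	return dent;
}
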
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f6dac40f87ff..80e8472d618b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -148,8 +148,7 @@ int __init btrfs_prelim_ref_init(void)
148 148
149void btrfs_prelim_ref_exit(void) 149void btrfs_prelim_ref_exit(void)
150{ 150{
151 if (btrfs_prelim_ref_cache) 151 kmem_cache_destroy(btrfs_prelim_ref_cache);
152 kmem_cache_destroy(btrfs_prelim_ref_cache);
153} 152}
154 153
155/* 154/*
@@ -566,17 +565,14 @@ static void __merge_refs(struct list_head *head, int mode)
566 struct __prelim_ref *pos2 = pos1, *tmp; 565 struct __prelim_ref *pos2 = pos1, *tmp;
567 566
568 list_for_each_entry_safe_continue(pos2, tmp, head, list) { 567 list_for_each_entry_safe_continue(pos2, tmp, head, list) {
569 struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; 568 struct __prelim_ref *ref1 = pos1, *ref2 = pos2;
570 struct extent_inode_elem *eie; 569 struct extent_inode_elem *eie;
571 570
572 if (!ref_for_same_block(ref1, ref2)) 571 if (!ref_for_same_block(ref1, ref2))
573 continue; 572 continue;
574 if (mode == 1) { 573 if (mode == 1) {
575 if (!ref1->parent && ref2->parent) { 574 if (!ref1->parent && ref2->parent)
576 xchg = ref1; 575 swap(ref1, ref2);
577 ref1 = ref2;
578 ref2 = xchg;
579 }
580 } else { 576 } else {
581 if (ref1->parent != ref2->parent) 577 if (ref1->parent != ref2->parent)
582 continue; 578 continue;
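
__merge_refs() drops the hand-rolled three-assignment exchange through the xchg temporary in favor of the generic swap() macro from <linux/kernel.h>, which expands to exactly that exchange with a type-matched temporary. In isolation (the helper is hypothetical):

#include <linux/kernel.h>	/* swap() */

static void keep_parented_first(struct __prelim_ref **a,
				struct __prelim_ref **b)
{
	/* order two refs the way the mode == 1 branch wants them */
	if (!(*a)->parent && (*b)->parent)
		swap(*a, *b);	/* swaps the two pointers in place */
}
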
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 861d472564c1..e34a71b3e225 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -95,6 +95,7 @@
95#include <linux/genhd.h> 95#include <linux/genhd.h>
96#include <linux/blkdev.h> 96#include <linux/blkdev.h>
97#include <linux/vmalloc.h> 97#include <linux/vmalloc.h>
98#include <linux/string.h>
98#include "ctree.h" 99#include "ctree.h"
99#include "disk-io.h" 100#include "disk-io.h"
100#include "hash.h" 101#include "hash.h"
@@ -105,6 +106,7 @@
105#include "locking.h" 106#include "locking.h"
106#include "check-integrity.h" 107#include "check-integrity.h"
107#include "rcu-string.h" 108#include "rcu-string.h"
109#include "compression.h"
108 110
109#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 111#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
110#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 112#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -176,7 +178,7 @@ struct btrfsic_block {
176 * Elements of this type are allocated dynamically and required because 178 * Elements of this type are allocated dynamically and required because
177 * each block object can refer to and can be ref from multiple blocks. 179 * each block object can refer to and can be ref from multiple blocks.
178 * The key to lookup them in the hashtable is the dev_bytenr of 180 * The key to lookup them in the hashtable is the dev_bytenr of
179 * the block ref to plus the one from the block refered from. 181 * the block ref to plus the one from the block referred from.
180 * The fact that they are searchable via a hashtable and that a 182 * The fact that they are searchable via a hashtable and that a
181 * ref_cnt is maintained is not required for the btrfs integrity 183 * ref_cnt is maintained is not required for the btrfs integrity
182 * check algorithm itself, it is only used to make the output more 184 * check algorithm itself, it is only used to make the output more
@@ -3076,7 +3078,7 @@ int btrfsic_mount(struct btrfs_root *root,
3076 3078
3077 list_for_each_entry(device, dev_head, dev_list) { 3079 list_for_each_entry(device, dev_head, dev_list) {
3078 struct btrfsic_dev_state *ds; 3080 struct btrfsic_dev_state *ds;
3079 char *p; 3081 const char *p;
3080 3082
3081 if (!device->bdev || !device->name) 3083 if (!device->bdev || !device->name)
3082 continue; 3084 continue;
@@ -3092,11 +3094,7 @@ int btrfsic_mount(struct btrfs_root *root,
3092 ds->state = state; 3094 ds->state = state;
3093 bdevname(ds->bdev, ds->name); 3095 bdevname(ds->bdev, ds->name);
3094 ds->name[BDEVNAME_SIZE - 1] = '\0'; 3096 ds->name[BDEVNAME_SIZE - 1] = '\0';
3095 for (p = ds->name; *p != '\0'; p++); 3097 p = kbasename(ds->name);
3096 while (p > ds->name && *p != '/')
3097 p--;
3098 if (*p == '/')
3099 p++;
3100 strlcpy(ds->name, p, sizeof(ds->name)); 3098 strlcpy(ds->name, p, sizeof(ds->name));
3101 btrfsic_dev_state_hashtable_add(ds, 3099 btrfsic_dev_state_hashtable_add(ds,
3102 &btrfsic_dev_state_hashtable); 3100 &btrfsic_dev_state_hashtable);
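
The open-coded basename scan (walk to the terminating NUL, walk back to the last '/', step past it) collapses into kbasename() from <linux/string.h>, which is why the first check-integrity.c hunk adds that include and why p becomes const char *: kbasename() returns a const pointer into its argument. For example:

#include <linux/string.h>

static void kbasename_example(void)
{
	const char *p;

	p = kbasename("/dev/mapper/vg-root");	/* -> "vg-root" */
	p = kbasename("sda1");			/* no '/': the whole string */
	(void)p;
}
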
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 13a4dc0436c9..f49d8b8c0f00 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
48void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, 48void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
49 unsigned long pg_index, 49 unsigned long pg_index,
50 unsigned long pg_offset); 50 unsigned long pg_offset);
51
52enum btrfs_compression_type {
53 BTRFS_COMPRESS_NONE = 0,
54 BTRFS_COMPRESS_ZLIB = 1,
55 BTRFS_COMPRESS_LZO = 2,
56 BTRFS_COMPRESS_TYPES = 2,
57 BTRFS_COMPRESS_LAST = 3,
58};
59
51struct btrfs_compress_op { 60struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void); 61 struct list_head *(*alloc_workspace)(void);
53 62
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 769e0ff1b4ce..77592931ab4f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -311,7 +311,7 @@ struct tree_mod_root {
311 311
312struct tree_mod_elem { 312struct tree_mod_elem {
313 struct rb_node node; 313 struct rb_node node;
314 u64 index; /* shifted logical */ 314 u64 logical;
315 u64 seq; 315 u64 seq;
316 enum mod_log_op op; 316 enum mod_log_op op;
317 317
@@ -435,11 +435,11 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
435 435
436/* 436/*
437 * key order of the log: 437 * key order of the log:
438 * index -> sequence 438 * node/leaf start address -> sequence
439 * 439 *
440 * the index is the shifted logical of the *new* root node for root replace 440 * The 'start address' is the logical address of the *new* root node
441 * operations, or the shifted logical of the affected block for all other 441 * for root replace operations, or the logical address of the affected
442 * operations. 442 * block for all other operations.
443 * 443 *
444 * Note: must be called with write lock (tree_mod_log_write_lock). 444 * Note: must be called with write lock (tree_mod_log_write_lock).
445 */ 445 */
@@ -460,9 +460,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
460 while (*new) { 460 while (*new) {
461 cur = container_of(*new, struct tree_mod_elem, node); 461 cur = container_of(*new, struct tree_mod_elem, node);
462 parent = *new; 462 parent = *new;
463 if (cur->index < tm->index) 463 if (cur->logical < tm->logical)
464 new = &((*new)->rb_left); 464 new = &((*new)->rb_left);
465 else if (cur->index > tm->index) 465 else if (cur->logical > tm->logical)
466 new = &((*new)->rb_right); 466 new = &((*new)->rb_right);
467 else if (cur->seq < tm->seq) 467 else if (cur->seq < tm->seq)
468 new = &((*new)->rb_left); 468 new = &((*new)->rb_left);
@@ -523,7 +523,7 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
523 if (!tm) 523 if (!tm)
524 return NULL; 524 return NULL;
525 525
526 tm->index = eb->start >> PAGE_CACHE_SHIFT; 526 tm->logical = eb->start;
527 if (op != MOD_LOG_KEY_ADD) { 527 if (op != MOD_LOG_KEY_ADD) {
528 btrfs_node_key(eb, &tm->key, slot); 528 btrfs_node_key(eb, &tm->key, slot);
529 tm->blockptr = btrfs_node_blockptr(eb, slot); 529 tm->blockptr = btrfs_node_blockptr(eb, slot);
@@ -588,7 +588,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
588 goto free_tms; 588 goto free_tms;
589 } 589 }
590 590
591 tm->index = eb->start >> PAGE_CACHE_SHIFT; 591 tm->logical = eb->start;
592 tm->slot = src_slot; 592 tm->slot = src_slot;
593 tm->move.dst_slot = dst_slot; 593 tm->move.dst_slot = dst_slot;
594 tm->move.nr_items = nr_items; 594 tm->move.nr_items = nr_items;
@@ -699,7 +699,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
699 goto free_tms; 699 goto free_tms;
700 } 700 }
701 701
702 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 702 tm->logical = new_root->start;
703 tm->old_root.logical = old_root->start; 703 tm->old_root.logical = old_root->start;
704 tm->old_root.level = btrfs_header_level(old_root); 704 tm->old_root.level = btrfs_header_level(old_root);
705 tm->generation = btrfs_header_generation(old_root); 705 tm->generation = btrfs_header_generation(old_root);
@@ -739,16 +739,15 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
739 struct rb_node *node; 739 struct rb_node *node;
740 struct tree_mod_elem *cur = NULL; 740 struct tree_mod_elem *cur = NULL;
741 struct tree_mod_elem *found = NULL; 741 struct tree_mod_elem *found = NULL;
742 u64 index = start >> PAGE_CACHE_SHIFT;
743 742
744 tree_mod_log_read_lock(fs_info); 743 tree_mod_log_read_lock(fs_info);
745 tm_root = &fs_info->tree_mod_log; 744 tm_root = &fs_info->tree_mod_log;
746 node = tm_root->rb_node; 745 node = tm_root->rb_node;
747 while (node) { 746 while (node) {
748 cur = container_of(node, struct tree_mod_elem, node); 747 cur = container_of(node, struct tree_mod_elem, node);
749 if (cur->index < index) { 748 if (cur->logical < start) {
750 node = node->rb_left; 749 node = node->rb_left;
751 } else if (cur->index > index) { 750 } else if (cur->logical > start) {
752 node = node->rb_right; 751 node = node->rb_right;
753 } else if (cur->seq < min_seq) { 752 } else if (cur->seq < min_seq) {
754 node = node->rb_left; 753 node = node->rb_left;
@@ -1230,9 +1229,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1230 return NULL; 1229 return NULL;
1231 1230
1232 /* 1231 /*
1233 * the very last operation that's logged for a root is the replacement 1232 * the very last operation that's logged for a root is the
1234 * operation (if it is replaced at all). this has the index of the *new* 1233 * replacement operation (if it is replaced at all). this has
1235 * root, making it the very first operation that's logged for this root. 1234 * the logical address of the *new* root, making it the very
1235 * first operation that's logged for this root.
1236 */ 1236 */
1237 while (1) { 1237 while (1) {
1238 tm = tree_mod_log_search_oldest(fs_info, root_logical, 1238 tm = tree_mod_log_search_oldest(fs_info, root_logical,
@@ -1336,7 +1336,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1336 if (!next) 1336 if (!next)
1337 break; 1337 break;
1338 tm = container_of(next, struct tree_mod_elem, node); 1338 tm = container_of(next, struct tree_mod_elem, node);
1339 if (tm->index != first_tm->index) 1339 if (tm->logical != first_tm->logical)
1340 break; 1340 break;
1341 } 1341 }
1342 tree_mod_log_read_unlock(fs_info); 1342 tree_mod_log_read_unlock(fs_info);
@@ -5361,7 +5361,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5361 goto out; 5361 goto out;
5362 } 5362 }
5363 5363
5364 tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); 5364 tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL);
5365 if (!tmp_buf) { 5365 if (!tmp_buf) {
5366 ret = -ENOMEM; 5366 ret = -ENOMEM;
5367 goto out; 5367 goto out;
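
The tree-mod log used to key its rb-tree on eb->start >> PAGE_CACHE_SHIFT, a page-granular index; the ctree.c hunks store the full byte-granular logical address instead, dropping the hidden assumption that one page maps to one block. The ordering is lexicographic on (logical, seq). A cut-down sketch of the slot search; struct mod_elem and find_slot() are illustrations, not the kernel's names:

#include <linux/rbtree.h>

struct mod_elem {
	struct rb_node node;
	u64 logical;	/* byte address of the affected block */
	u64 seq;	/* tree-mod sequence number */
};

/* returns the link to attach a new node to, or NULL on a duplicate key */
static struct rb_node **find_slot(struct rb_root *root, struct mod_elem *tm,
				  struct rb_node **parent)
{
	struct rb_node **new = &root->rb_node;
	struct mod_elem *cur;

	*parent = NULL;
	while (*new) {
		cur = rb_entry(*new, struct mod_elem, node);
		*parent = *new;
		if (cur->logical < tm->logical)
			new = &(*new)->rb_left;
		else if (cur->logical > tm->logical)
			new = &(*new)->rb_right;
		else if (cur->seq < tm->seq)
			new = &(*new)->rb_left;
		else if (cur->seq > tm->seq)
			new = &(*new)->rb_right;
		else
			return NULL;	/* same (logical, seq) already logged */
	}
	return new;
}
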
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bfe4a337fb4d..84a6a5b3384a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -100,6 +100,9 @@ struct btrfs_ordered_sum;
100/* tracks free space in block groups. */ 100/* tracks free space in block groups. */
101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL 101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
102 102
103/* device stats in the device tree */
104#define BTRFS_DEV_STATS_OBJECTID 0ULL
105
103/* for storing balance parameters in the root tree */ 106/* for storing balance parameters in the root tree */
104#define BTRFS_BALANCE_OBJECTID -4ULL 107#define BTRFS_BALANCE_OBJECTID -4ULL
105 108
@@ -715,14 +718,6 @@ struct btrfs_timespec {
715 __le32 nsec; 718 __le32 nsec;
716} __attribute__ ((__packed__)); 719} __attribute__ ((__packed__));
717 720
718enum btrfs_compression_type {
719 BTRFS_COMPRESS_NONE = 0,
720 BTRFS_COMPRESS_ZLIB = 1,
721 BTRFS_COMPRESS_LZO = 2,
722 BTRFS_COMPRESS_TYPES = 2,
723 BTRFS_COMPRESS_LAST = 3,
724};
725
726struct btrfs_inode_item { 721struct btrfs_inode_item {
727 /* nfs style generation number */ 722 /* nfs style generation number */
728 __le64 generation; 723 __le64 generation;
@@ -793,7 +788,7 @@ struct btrfs_root_item {
793 788
794 /* 789 /*
795 * This generation number is used to test if the new fields are valid 790 * This generation number is used to test if the new fields are valid
796 * and up to date while reading the root item. Everytime the root item 791 * and up to date while reading the root item. Every time the root item
797 * is written out, the "generation" field is copied into this field. If 792 * is written out, the "generation" field is copied into this field. If
798 * anyone ever mounted the fs with an older kernel, we will have 793 * anyone ever mounted the fs with an older kernel, we will have
799 * mismatching generation values here and thus must invalidate the 794 * mismatching generation values here and thus must invalidate the
@@ -1002,8 +997,10 @@ struct btrfs_dev_replace {
1002 pid_t lock_owner; 997 pid_t lock_owner;
1003 atomic_t nesting_level; 998 atomic_t nesting_level;
1004 struct mutex lock_finishing_cancel_unmount; 999 struct mutex lock_finishing_cancel_unmount;
1005 struct mutex lock_management_lock; 1000 rwlock_t lock;
1006 struct mutex lock; 1001 atomic_t read_locks;
1002 atomic_t blocking_readers;
1003 wait_queue_head_t read_lock_wq;
1007 1004
1008 struct btrfs_scrub_progress scrub_progress; 1005 struct btrfs_scrub_progress scrub_progress;
1009}; 1006};
@@ -1222,10 +1219,10 @@ struct btrfs_space_info {
1222 * we've called update_block_group and dropped the bytes_used counter 1219 * we've called update_block_group and dropped the bytes_used counter
1223 * and increased the bytes_pinned counter. However this means that 1220 * and increased the bytes_pinned counter. However this means that
1224 * bytes_pinned does not reflect the bytes that will be pinned once the 1221 * bytes_pinned does not reflect the bytes that will be pinned once the
1225 * delayed refs are flushed, so this counter is inc'ed everytime we call 1222 * delayed refs are flushed, so this counter is inc'ed every time we
1226 * btrfs_free_extent so it is a realtime count of what will be freed 1223 * call btrfs_free_extent so it is a realtime count of what will be
1227 * once the transaction is committed. It will be zero'ed everytime the 1224 * freed once the transaction is committed. It will be zero'ed every
1228 * transaction commits. 1225 * time the transaction commits.
1229 */ 1226 */
1230 struct percpu_counter total_bytes_pinned; 1227 struct percpu_counter total_bytes_pinned;
1231 1228
@@ -1822,6 +1819,9 @@ struct btrfs_fs_info {
1822 spinlock_t reada_lock; 1819 spinlock_t reada_lock;
1823 struct radix_tree_root reada_tree; 1820 struct radix_tree_root reada_tree;
1824 1821
1822 /* readahead works cnt */
1823 atomic_t reada_works_cnt;
1824
1825 /* Extent buffer radix tree */ 1825 /* Extent buffer radix tree */
1826 spinlock_t buffer_lock; 1826 spinlock_t buffer_lock;
1827 struct radix_tree_root buffer_radix; 1827 struct radix_tree_root buffer_radix;
@@ -2185,13 +2185,43 @@ struct btrfs_ioctl_defrag_range_args {
2185 */ 2185 */
2186#define BTRFS_QGROUP_RELATION_KEY 246 2186#define BTRFS_QGROUP_RELATION_KEY 246
2187 2187
2188/*
2189 * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
2190 */
2188#define BTRFS_BALANCE_ITEM_KEY 248 2191#define BTRFS_BALANCE_ITEM_KEY 248
2189 2192
2190/* 2193/*
2191 * Persistantly stores the io stats in the device tree. 2194 * The key type for tree items that are stored persistently, but do not need to
2192 * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). 2195 * exist for extended period of time. The items can exist in any tree.
2196 *
2197 * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
2198 *
2199 * Existing items:
2200 *
2201 * - balance status item
2202 * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
2193 */ 2203 */
2194#define BTRFS_DEV_STATS_KEY 249 2204#define BTRFS_TEMPORARY_ITEM_KEY 248
2205
2206/*
2207 * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
2208 */
2209#define BTRFS_DEV_STATS_KEY 249
2210
2211/*
2212 * The key type for tree items that are stored persistently and usually exist
2213 * for a long period, eg. filesystem lifetime. The item kinds can be status
2214 * information, stats or preference values. The item can exist in any tree.
2215 *
2216 * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
2217 *
2218 * Existing items:
2219 *
2220 * - device statistics, store IO stats in the device tree, one key for all
2221 * stats
2222 * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
2223 */
2224#define BTRFS_PERSISTENT_ITEM_KEY 249
2195 2225
2196/* 2226/*
2197 * Persistantly stores the device replace state in the device tree. 2227 * Persistantly stores the device replace state in the device tree.
@@ -2241,7 +2271,7 @@ struct btrfs_ioctl_defrag_range_args {
2241#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 2271#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
2242#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 2272#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
2243#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 2273#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
2244#define BTRFS_MOUNT_RECOVERY (1 << 18) 2274#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
2245#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) 2275#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
2246#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) 2276#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
2247#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2277#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
@@ -2250,9 +2280,10 @@ struct btrfs_ioctl_defrag_range_args {
2250#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) 2280#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
2251#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) 2281#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
2252#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) 2282#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
2283#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
2253 2284
2254#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2285#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2255#define BTRFS_DEFAULT_MAX_INLINE (8192) 2286#define BTRFS_DEFAULT_MAX_INLINE (2048)
2256 2287
2257#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2288#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
2258#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 2289#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2353,6 +2384,9 @@ struct btrfs_map_token {
2353 unsigned long offset; 2384 unsigned long offset;
2354}; 2385};
2355 2386
2387#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
2388 ((bytes) >> (fs_info)->sb->s_blocksize_bits)
2389
2356static inline void btrfs_init_map_token (struct btrfs_map_token *token) 2390static inline void btrfs_init_map_token (struct btrfs_map_token *token)
2357{ 2391{
2358 token->kaddr = NULL; 2392 token->kaddr = NULL;
@@ -3448,8 +3482,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
3448static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3482static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3449 unsigned num_items) 3483 unsigned num_items)
3450{ 3484{
3451 return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3485 return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
3452 2 * num_items;
3453} 3486}
3454 3487
3455/* 3488/*
@@ -4027,7 +4060,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4027 struct btrfs_root *root, 4060 struct btrfs_root *root,
4028 struct inode *dir, u64 objectid, 4061 struct inode *dir, u64 objectid,
4029 const char *name, int name_len); 4062 const char *name, int name_len);
4030int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4063int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4031 int front); 4064 int front);
4032int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4065int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4033 struct btrfs_root *root, 4066 struct btrfs_root *root,
@@ -4089,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
4089 4122
4090/* ioctl.c */ 4123/* ioctl.c */
4091long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 4124long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
4125int btrfs_ioctl_get_supported_features(void __user *arg);
4092void btrfs_update_iflags(struct inode *inode); 4126void btrfs_update_iflags(struct inode *inode);
4093void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 4127void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
4094int btrfs_is_empty_uuid(u8 *uuid); 4128int btrfs_is_empty_uuid(u8 *uuid);
@@ -4151,7 +4185,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
4151ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 4185ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
4152 4186
4153/* super.c */ 4187/* super.c */
4154int btrfs_parse_options(struct btrfs_root *root, char *options); 4188int btrfs_parse_options(struct btrfs_root *root, char *options,
4189 unsigned long new_flags);
4155int btrfs_sync_fs(struct super_block *sb, int wait); 4190int btrfs_sync_fs(struct super_block *sb, int wait);
4156 4191
4157#ifdef CONFIG_PRINTK 4192#ifdef CONFIG_PRINTK
@@ -4525,8 +4560,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
4525 struct btrfs_key *start, struct btrfs_key *end); 4560 struct btrfs_key *start, struct btrfs_key *end);
4526int btrfs_reada_wait(void *handle); 4561int btrfs_reada_wait(void *handle);
4527void btrfs_reada_detach(void *handle); 4562void btrfs_reada_detach(void *handle);
4528int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 4563int btree_readahead_hook(struct btrfs_fs_info *fs_info,
4529 u64 start, int err); 4564 struct extent_buffer *eb, u64 start, int err);
4530 4565
4531static inline int is_fstree(u64 rootid) 4566static inline int is_fstree(u64 rootid)
4532{ 4567{
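
Two ctree.h changes are worth spelling out: the btrfs_calc_trans_metadata_size() rewrite is pure algebra, not a behavior change, and BTRFS_BYTES_TO_BLKS() is a plain shift by the superblock's block-size bits:

/*
 *   (nodesize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 2 * num_items
 * = nodesize * (1 + (BTRFS_MAX_LEVEL - 1)) * 2 * num_items
 * = nodesize * BTRFS_MAX_LEVEL * 2 * num_items
 *
 * BTRFS_BYTES_TO_BLKS(fs_info, 65536) with a 4KiB block size
 * (s_blocksize_bits == 12) evaluates to 65536 >> 12 == 16.
 */
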
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b57daa895cea..6cef0062f929 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -43,8 +43,7 @@ int __init btrfs_delayed_inode_init(void)
43 43
44void btrfs_delayed_inode_exit(void) 44void btrfs_delayed_inode_exit(void)
45{ 45{
46 if (delayed_node_cache) 46 kmem_cache_destroy(delayed_node_cache);
47 kmem_cache_destroy(delayed_node_cache);
48} 47}
49 48
50static inline void btrfs_init_delayed_node( 49static inline void btrfs_init_delayed_node(
@@ -651,9 +650,14 @@ static int btrfs_delayed_inode_reserve_metadata(
651 goto out; 650 goto out;
652 651
653 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 652 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
654 if (!WARN_ON(ret)) 653 if (!ret)
655 goto out; 654 goto out;
656 655
656 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
657 btrfs_debug(root->fs_info,
658 "block rsv migrate returned %d", ret);
659 WARN_ON(1);
660 }
657 /* 661 /*
658 * Ok this is a problem, let's just steal from the global rsv 662 * Ok this is a problem, let's just steal from the global rsv
659 * since this really shouldn't happen that often. 663 * since this really shouldn't happen that often.
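
The reserve-metadata hunk demotes an unconditional WARN_ON(ret) to a warning gated behind the enospc_debug mount option: ordinary callers just see the error handled, while anyone chasing ENOSPC gets the message plus a stack trace. A hypothetical wrapper restating that shape (note btrfs_test_opt() still takes a root at this point in the tree):

static int example_migrate(struct btrfs_root *root,
			   struct btrfs_block_rsv *src,
			   struct btrfs_block_rsv *dst, u64 num_bytes)
{
	int ret;

	ret = btrfs_block_rsv_migrate(src, dst, num_bytes);
	if (!ret)
		return 0;

	/* noisy diagnostics are now opt-in via -o enospc_debug */
	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
		btrfs_debug(root->fs_info,
			    "block rsv migrate returned %d", ret);
		WARN_ON(1);
	}
	return ret;
}
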
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 914ac13bd92f..430b3689b112 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -929,14 +929,10 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
929 929
930void btrfs_delayed_ref_exit(void) 930void btrfs_delayed_ref_exit(void)
931{ 931{
932 if (btrfs_delayed_ref_head_cachep) 932 kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
933 kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 933 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
934 if (btrfs_delayed_tree_ref_cachep) 934 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
935 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); 935 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
936 if (btrfs_delayed_data_ref_cachep)
937 kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
938 if (btrfs_delayed_extent_op_cachep)
939 kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
940} 936}
941 937
942int btrfs_delayed_ref_init(void) 938int btrfs_delayed_ref_init(void)
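
The four dropped NULL checks in btrfs_delayed_ref_exit(), like the matching ones in backref.c and delayed-inode.c, rely on kmem_cache_destroy() being a no-op for a NULL pointer. That is what lets a failed init reuse the common exit path without testing each cache, roughly as follows (the item types and names are hypothetical):

#include <linux/init.h>
#include <linux/slab.h>

struct item_a { int x; };
struct item_b { long y; };

static struct kmem_cache *cache_a;
static struct kmem_cache *cache_b;

static void example_exit(void)
{
	kmem_cache_destroy(cache_a);	/* NULL-safe: no guard needed */
	kmem_cache_destroy(cache_b);
}

static int __init example_init(void)
{
	cache_a = KMEM_CACHE(item_a, 0);
	cache_b = KMEM_CACHE(item_b, 0);
	if (!cache_a || !cache_b) {
		example_exit();		/* tears down whatever was created */
		return -ENOMEM;
	}
	return 0;
}
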
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cbb7dbfb3fff..a1d6652e0c47 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
202 struct btrfs_dev_replace_item *ptr; 202 struct btrfs_dev_replace_item *ptr;
203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 203 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
204 204
205 btrfs_dev_replace_lock(dev_replace); 205 btrfs_dev_replace_lock(dev_replace, 0);
206 if (!dev_replace->is_valid || 206 if (!dev_replace->is_valid ||
207 !dev_replace->item_needs_writeback) { 207 !dev_replace->item_needs_writeback) {
208 btrfs_dev_replace_unlock(dev_replace); 208 btrfs_dev_replace_unlock(dev_replace, 0);
209 return 0; 209 return 0;
210 } 210 }
211 btrfs_dev_replace_unlock(dev_replace); 211 btrfs_dev_replace_unlock(dev_replace, 0);
212 212
213 key.objectid = 0; 213 key.objectid = 0;
214 key.type = BTRFS_DEV_REPLACE_KEY; 214 key.type = BTRFS_DEV_REPLACE_KEY;
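
Every dev-replace call site below passes an explicit intent flag, 0 for read and 1 for write, to btrfs_dev_replace_lock()/unlock(), matching the ctree.h hunk that replaces the two mutexes with an rwlock_t plus read_locks, blocking_readers and read_lock_wq. A deliberately simplified sketch of what the flag selects; the counters and wait queue suggest the real helpers also coordinate blocking readers, which a bare rwlock_t alone cannot express:

static void example_dev_replace_lock(struct btrfs_dev_replace *dr, int rw)
{
	if (rw == 1)
		write_lock(&dr->lock);	/* exclusive: state will change */
	else
		read_lock(&dr->lock);	/* shared: just sampling state */
}

static void example_dev_replace_unlock(struct btrfs_dev_replace *dr, int rw)
{
	if (rw == 1)
		write_unlock(&dr->lock);
	else
		read_unlock(&dr->lock);
}
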
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
264 ptr = btrfs_item_ptr(eb, path->slots[0], 264 ptr = btrfs_item_ptr(eb, path->slots[0],
265 struct btrfs_dev_replace_item); 265 struct btrfs_dev_replace_item);
266 266
267 btrfs_dev_replace_lock(dev_replace); 267 btrfs_dev_replace_lock(dev_replace, 1);
268 if (dev_replace->srcdev) 268 if (dev_replace->srcdev)
269 btrfs_set_dev_replace_src_devid(eb, ptr, 269 btrfs_set_dev_replace_src_devid(eb, ptr,
270 dev_replace->srcdev->devid); 270 dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
287 btrfs_set_dev_replace_cursor_right(eb, ptr, 287 btrfs_set_dev_replace_cursor_right(eb, ptr,
288 dev_replace->cursor_right); 288 dev_replace->cursor_right);
289 dev_replace->item_needs_writeback = 0; 289 dev_replace->item_needs_writeback = 0;
290 btrfs_dev_replace_unlock(dev_replace); 290 btrfs_dev_replace_unlock(dev_replace, 1);
291 291
292 btrfs_mark_buffer_dirty(eb); 292 btrfs_mark_buffer_dirty(eb);
293 293
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
356 return PTR_ERR(trans); 356 return PTR_ERR(trans);
357 } 357 }
358 358
359 btrfs_dev_replace_lock(dev_replace); 359 btrfs_dev_replace_lock(dev_replace, 1);
360 switch (dev_replace->replace_state) { 360 switch (dev_replace->replace_state) {
361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 361 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 362 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
395 dev_replace->is_valid = 1; 395 dev_replace->is_valid = 1;
396 dev_replace->item_needs_writeback = 1; 396 dev_replace->item_needs_writeback = 1;
397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 397 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
398 btrfs_dev_replace_unlock(dev_replace); 398 btrfs_dev_replace_unlock(dev_replace, 1);
399 399
400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 400 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
401 if (ret) 401 if (ret)
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
407 trans = btrfs_start_transaction(root, 0); 407 trans = btrfs_start_transaction(root, 0);
408 if (IS_ERR(trans)) { 408 if (IS_ERR(trans)) {
409 ret = PTR_ERR(trans); 409 ret = PTR_ERR(trans);
410 btrfs_dev_replace_lock(dev_replace); 410 btrfs_dev_replace_lock(dev_replace, 1);
411 goto leave; 411 goto leave;
412 } 412 }
413 413
@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
433leave: 433leave:
434 dev_replace->srcdev = NULL; 434 dev_replace->srcdev = NULL;
435 dev_replace->tgtdev = NULL; 435 dev_replace->tgtdev = NULL;
436 btrfs_dev_replace_unlock(dev_replace); 436 btrfs_dev_replace_unlock(dev_replace, 1);
437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 437 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
438 return ret; 438 return ret;
439} 439}
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
471 /* don't allow cancel or unmount to disturb the finishing procedure */ 471 /* don't allow cancel or unmount to disturb the finishing procedure */
472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 472 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
473 473
474 btrfs_dev_replace_lock(dev_replace); 474 btrfs_dev_replace_lock(dev_replace, 0);
475 /* was the operation canceled, or is it finished? */ 475 /* was the operation canceled, or is it finished? */
476 if (dev_replace->replace_state != 476 if (dev_replace->replace_state !=
477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 477 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
478 btrfs_dev_replace_unlock(dev_replace); 478 btrfs_dev_replace_unlock(dev_replace, 0);
479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 479 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
480 return 0; 480 return 0;
481 } 481 }
482 482
483 tgt_device = dev_replace->tgtdev; 483 tgt_device = dev_replace->tgtdev;
484 src_device = dev_replace->srcdev; 484 src_device = dev_replace->srcdev;
485 btrfs_dev_replace_unlock(dev_replace); 485 btrfs_dev_replace_unlock(dev_replace, 0);
486 486
487 /* 487 /*
488 * flush all outstanding I/O and inode extent mappings before the 488 * flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 /* keep away write_all_supers() during the finishing procedure */ 507 /* keep away write_all_supers() during the finishing procedure */
508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 508 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
509 mutex_lock(&root->fs_info->chunk_mutex); 509 mutex_lock(&root->fs_info->chunk_mutex);
510 btrfs_dev_replace_lock(dev_replace); 510 btrfs_dev_replace_lock(dev_replace, 1);
511 dev_replace->replace_state = 511 dev_replace->replace_state =
512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 512 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 513 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
528 rcu_str_deref(src_device->name), 528 rcu_str_deref(src_device->name),
529 src_device->devid, 529 src_device->devid,
530 rcu_str_deref(tgt_device->name), scrub_ret); 530 rcu_str_deref(tgt_device->name), scrub_ret);
531 btrfs_dev_replace_unlock(dev_replace); 531 btrfs_dev_replace_unlock(dev_replace, 1);
532 mutex_unlock(&root->fs_info->chunk_mutex); 532 mutex_unlock(&root->fs_info->chunk_mutex);
533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 533 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
534 mutex_unlock(&uuid_mutex); 534 mutex_unlock(&uuid_mutex);
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 565 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
566 fs_info->fs_devices->rw_devices++; 566 fs_info->fs_devices->rw_devices++;
567 567
568 btrfs_dev_replace_unlock(dev_replace); 568 btrfs_dev_replace_unlock(dev_replace, 1);
569 569
570 btrfs_rm_dev_replace_blocked(fs_info); 570 btrfs_rm_dev_replace_blocked(fs_info);
571 571
@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 649 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
650 struct btrfs_device *srcdev; 650 struct btrfs_device *srcdev;
651 651
652 btrfs_dev_replace_lock(dev_replace); 652 btrfs_dev_replace_lock(dev_replace, 0);
653 /* even if !dev_replace_is_valid, the values are good enough for 653 /* even if !dev_replace_is_valid, the values are good enough for
654 * the replace_status ioctl */ 654 * the replace_status ioctl */
655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 655 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); 675 div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
676 break; 676 break;
677 } 677 }
678 btrfs_dev_replace_unlock(dev_replace); 678 btrfs_dev_replace_unlock(dev_replace, 0);
679} 679}
680 680
681int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 681int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
698 return -EROFS; 698 return -EROFS;
699 699
700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 700 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
701 btrfs_dev_replace_lock(dev_replace); 701 btrfs_dev_replace_lock(dev_replace, 1);
702 switch (dev_replace->replace_state) { 702 switch (dev_replace->replace_state) {
703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 703 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 704 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 705 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 706 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
707 btrfs_dev_replace_unlock(dev_replace); 707 btrfs_dev_replace_unlock(dev_replace, 1);
708 goto leave; 708 goto leave;
709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 709 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 710 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 717 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
718 dev_replace->time_stopped = get_seconds(); 718 dev_replace->time_stopped = get_seconds();
719 dev_replace->item_needs_writeback = 1; 719 dev_replace->item_needs_writeback = 1;
720 btrfs_dev_replace_unlock(dev_replace); 720 btrfs_dev_replace_unlock(dev_replace, 1);
721 btrfs_scrub_cancel(fs_info); 721 btrfs_scrub_cancel(fs_info);
722 722
723 trans = btrfs_start_transaction(root, 0); 723 trans = btrfs_start_transaction(root, 0);
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 740 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
741 741
742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 742 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
743 btrfs_dev_replace_lock(dev_replace); 743 btrfs_dev_replace_lock(dev_replace, 1);
744 switch (dev_replace->replace_state) { 744 switch (dev_replace->replace_state) {
745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 745 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 746 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
756 break; 756 break;
757 } 757 }
758 758
759 btrfs_dev_replace_unlock(dev_replace); 759 btrfs_dev_replace_unlock(dev_replace, 1);
760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 760 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
761} 761}
762 762
@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
766 struct task_struct *task; 766 struct task_struct *task;
767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 767 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
768 768
769 btrfs_dev_replace_lock(dev_replace); 769 btrfs_dev_replace_lock(dev_replace, 1);
770 switch (dev_replace->replace_state) { 770 switch (dev_replace->replace_state) {
771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 771 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 772 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 773 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
774 btrfs_dev_replace_unlock(dev_replace); 774 btrfs_dev_replace_unlock(dev_replace, 1);
775 return 0; 775 return 0;
776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 776 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
777 break; 777 break;
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing"); 784 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
785 btrfs_info(fs_info, 785 btrfs_info(fs_info,
786 "you may cancel the operation after 'mount -o degraded'"); 786 "you may cancel the operation after 'mount -o degraded'");
787 btrfs_dev_replace_unlock(dev_replace); 787 btrfs_dev_replace_unlock(dev_replace, 1);
788 return 0; 788 return 0;
789 } 789 }
790 btrfs_dev_replace_unlock(dev_replace); 790 btrfs_dev_replace_unlock(dev_replace, 1);
791 791
792 WARN_ON(atomic_xchg( 792 WARN_ON(atomic_xchg(
793 &fs_info->mutually_exclusive_operation_running, 1)); 793 &fs_info->mutually_exclusive_operation_running, 1));
@@ -802,7 +802,7 @@ static int btrfs_dev_replace_kthread(void *data)
802 struct btrfs_ioctl_dev_replace_args *status_args; 802 struct btrfs_ioctl_dev_replace_args *status_args;
803 u64 progress; 803 u64 progress;
804 804
805 status_args = kzalloc(sizeof(*status_args), GFP_NOFS); 805 status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
806 if (status_args) { 806 if (status_args) {
807 btrfs_dev_replace_status(fs_info, status_args); 807 btrfs_dev_replace_status(fs_info, status_args);
808 progress = status_args->status.progress_1000; 808 progress = status_args->status.progress_1000;
@@ -858,55 +858,65 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
858 * not called and the the filesystem is remounted 858 * not called and the the filesystem is remounted
859 * in degraded state. This does not stop the 859 * in degraded state. This does not stop the
860 * dev_replace procedure. It needs to be canceled 860 * dev_replace procedure. It needs to be canceled
861 * manually if the cancelation is wanted. 861 * manually if the cancellation is wanted.
862 */ 862 */
863 break; 863 break;
864 } 864 }
865 return 1; 865 return 1;
866} 866}
867 867
868void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) 868void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
869{ 869{
870 /* the beginning is just an optimization for the typical case */ 870 if (rw == 1) {
871 if (atomic_read(&dev_replace->nesting_level) == 0) { 871 /* write */
872acquire_lock: 872again:
873 /* this is not a nested case where the same thread 873 wait_event(dev_replace->read_lock_wq,
874 * is trying to acqurire the same lock twice */ 874 atomic_read(&dev_replace->blocking_readers) == 0);
875 mutex_lock(&dev_replace->lock); 875 write_lock(&dev_replace->lock);
876 mutex_lock(&dev_replace->lock_management_lock); 876 if (atomic_read(&dev_replace->blocking_readers)) {
877 dev_replace->lock_owner = current->pid; 877 write_unlock(&dev_replace->lock);
878 atomic_inc(&dev_replace->nesting_level); 878 goto again;
879 mutex_unlock(&dev_replace->lock_management_lock); 879 }
880 return; 880 } else {
881 read_lock(&dev_replace->lock);
882 atomic_inc(&dev_replace->read_locks);
881 } 883 }
884}
882 885
883 mutex_lock(&dev_replace->lock_management_lock); 886void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
884 if (atomic_read(&dev_replace->nesting_level) > 0 && 887{
885 dev_replace->lock_owner == current->pid) { 888 if (rw == 1) {
886 WARN_ON(!mutex_is_locked(&dev_replace->lock)); 889 /* write */
887 atomic_inc(&dev_replace->nesting_level); 890 ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
888 mutex_unlock(&dev_replace->lock_management_lock); 891 write_unlock(&dev_replace->lock);
889 return; 892 } else {
893 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
894 atomic_dec(&dev_replace->read_locks);
895 read_unlock(&dev_replace->lock);
890 } 896 }
897}
891 898
892 mutex_unlock(&dev_replace->lock_management_lock); 899/* inc blocking cnt and release read lock */
893 goto acquire_lock; 900void btrfs_dev_replace_set_lock_blocking(
901 struct btrfs_dev_replace *dev_replace)
902{
903 /* only set blocking for read lock */
904 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
905 atomic_inc(&dev_replace->blocking_readers);
906 read_unlock(&dev_replace->lock);
894} 907}
895 908
896void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) 909/* acquire read lock and dec blocking cnt */
910void btrfs_dev_replace_clear_lock_blocking(
911 struct btrfs_dev_replace *dev_replace)
897{ 912{
898 WARN_ON(!mutex_is_locked(&dev_replace->lock)); 913 /* only set blocking for read lock */
899 mutex_lock(&dev_replace->lock_management_lock); 914 ASSERT(atomic_read(&dev_replace->read_locks) > 0);
900 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); 915 ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
901 WARN_ON(dev_replace->lock_owner != current->pid); 916 read_lock(&dev_replace->lock);
902 atomic_dec(&dev_replace->nesting_level); 917 if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
903 if (atomic_read(&dev_replace->nesting_level) == 0) { 918 waitqueue_active(&dev_replace->read_lock_wq))
904 dev_replace->lock_owner = 0; 919 wake_up(&dev_replace->read_lock_wq);
905 mutex_unlock(&dev_replace->lock_management_lock);
906 mutex_unlock(&dev_replace->lock);
907 } else {
908 mutex_unlock(&dev_replace->lock_management_lock);
909 }
910} 920}
911 921
912void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) 922void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
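
The hunks above replace dev-replace's hand-rolled recursive mutex (lock, lock_management_lock, nesting_level, lock_owner) with a read/write protocol: btrfs_dev_replace_lock(dev_replace, 1) takes the write side, btrfs_dev_replace_lock(dev_replace, 0) the read side, and btrfs_dev_replace_set_lock_blocking() lets a reader drop the spinning rwlock while staying accounted in blocking_readers; writers wait on read_lock_wq until that count drains, then re-check it under the write lock. A minimal userspace model of the protocol, with pthread primitives standing in for rwlock_t, atomic_t and the wait queue (names mirror the kernel fields, everything else is illustrative; the read_locks counter, which only feeds the ASSERTs, is omitted):

    #include <pthread.h>
    #include <stdatomic.h>

    struct dev_replace_lock {
            pthread_rwlock_t lock;        /* models rwlock_t */
            atomic_int blocking_readers;  /* readers that dropped the lock */
            pthread_mutex_t wq_mutex;     /* with wq_cond, models read_lock_wq */
            pthread_cond_t wq_cond;
    };

    static void dr_lock(struct dev_replace_lock *d, int rw)
    {
            if (rw == 1) {
                    for (;;) {
                            /* wait_event(read_lock_wq, blocking_readers == 0) */
                            pthread_mutex_lock(&d->wq_mutex);
                            while (atomic_load(&d->blocking_readers))
                                    pthread_cond_wait(&d->wq_cond, &d->wq_mutex);
                            pthread_mutex_unlock(&d->wq_mutex);

                            pthread_rwlock_wrlock(&d->lock);
                            /* a reader may have gone blocking in between */
                            if (!atomic_load(&d->blocking_readers))
                                    return;
                            pthread_rwlock_unlock(&d->lock);  /* goto again */
                    }
            }
            pthread_rwlock_rdlock(&d->lock);
    }

    static void dr_set_blocking(struct dev_replace_lock *d)
    {
            /* reader is about to sleep: stay accounted, drop the lock */
            atomic_fetch_add(&d->blocking_readers, 1);
            pthread_rwlock_unlock(&d->lock);
    }

    static void dr_clear_blocking(struct dev_replace_lock *d)
    {
            pthread_rwlock_rdlock(&d->lock);
            /* last blocking reader wakes any waiting writer */
            if (atomic_fetch_sub(&d->blocking_readers, 1) == 1) {
                    pthread_mutex_lock(&d->wq_mutex);
                    pthread_cond_broadcast(&d->wq_cond);
                    pthread_mutex_unlock(&d->wq_mutex);
            }
    }

The re-check after taking the write lock matters: a reader can go blocking between the writer's wait and its lock acquisition, which is exactly what the goto again in the hunk handles.
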
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 20035cbbf021..29e3ef5f96bd 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); 34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); 35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); 36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); 37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); 38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
39void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
40void btrfs_dev_replace_clear_lock_blocking(
41 struct btrfs_dev_replace *dev_replace);
39 42
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) 43static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{ 44{
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4545e2e2ad45..4b02591b0301 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
50#include "raid56.h" 50#include "raid56.h"
51#include "sysfs.h" 51#include "sysfs.h"
52#include "qgroup.h" 52#include "qgroup.h"
53#include "compression.h"
53 54
54#ifdef CONFIG_X86 55#ifdef CONFIG_X86
55#include <asm/cpufeature.h> 56#include <asm/cpufeature.h>
@@ -110,8 +111,7 @@ int __init btrfs_end_io_wq_init(void)
110 111
111void btrfs_end_io_wq_exit(void) 112void btrfs_end_io_wq_exit(void)
112{ 113{
113 if (btrfs_end_io_wq_cache) 114 kmem_cache_destroy(btrfs_end_io_wq_cache);
114 kmem_cache_destroy(btrfs_end_io_wq_cache);
115} 115}
116 116
117/* 117/*
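
The guard removed here (and the matching ones in extent_io.c and extent_map.c further down) relies on kmem_cache_destroy() being NULL-safe, the same contract the C library gives free(). A userspace illustration of the idiom:

    #include <stdlib.h>

    /* free(NULL) is defined to be a no-op, so the guard is redundant;
     * kmem_cache_destroy() gives kernel cleanup paths the same guarantee. */
    int main(void)
    {
            void *cache = NULL;
            free(cache);    /* fine: nothing to destroy */
            return 0;
    }
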
@@ -612,6 +612,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
612 int found_level; 612 int found_level;
613 struct extent_buffer *eb; 613 struct extent_buffer *eb;
614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 614 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
615 struct btrfs_fs_info *fs_info = root->fs_info;
615 int ret = 0; 616 int ret = 0;
616 int reads_done; 617 int reads_done;
617 618
@@ -637,21 +638,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
637 638
638 found_start = btrfs_header_bytenr(eb); 639 found_start = btrfs_header_bytenr(eb);
639 if (found_start != eb->start) { 640 if (found_start != eb->start) {
640 btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", 641 btrfs_err_rl(fs_info, "bad tree block start %llu %llu",
641 found_start, eb->start); 642 found_start, eb->start);
642 ret = -EIO; 643 ret = -EIO;
643 goto err; 644 goto err;
644 } 645 }
645 if (check_tree_block_fsid(root->fs_info, eb)) { 646 if (check_tree_block_fsid(fs_info, eb)) {
646 btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", 647 btrfs_err_rl(fs_info, "bad fsid on block %llu",
647 eb->start); 648 eb->start);
648 ret = -EIO; 649 ret = -EIO;
649 goto err; 650 goto err;
650 } 651 }
651 found_level = btrfs_header_level(eb); 652 found_level = btrfs_header_level(eb);
652 if (found_level >= BTRFS_MAX_LEVEL) { 653 if (found_level >= BTRFS_MAX_LEVEL) {
653 btrfs_err(root->fs_info, "bad tree block level %d", 654 btrfs_err(fs_info, "bad tree block level %d",
654 (int)btrfs_header_level(eb)); 655 (int)btrfs_header_level(eb));
655 ret = -EIO; 656 ret = -EIO;
656 goto err; 657 goto err;
657 } 658 }
@@ -659,7 +660,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
659 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), 660 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
660 eb, found_level); 661 eb, found_level);
661 662
662 ret = csum_tree_block(root->fs_info, eb, 1); 663 ret = csum_tree_block(fs_info, eb, 1);
663 if (ret) { 664 if (ret) {
664 ret = -EIO; 665 ret = -EIO;
665 goto err; 666 goto err;
@@ -680,7 +681,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
680 err: 681 err:
681 if (reads_done && 682 if (reads_done &&
682 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 683 test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
683 btree_readahead_hook(root, eb, eb->start, ret); 684 btree_readahead_hook(fs_info, eb, eb->start, ret);
684 685
685 if (ret) { 686 if (ret) {
686 /* 687 /*
@@ -699,14 +700,13 @@ out:
699static int btree_io_failed_hook(struct page *page, int failed_mirror) 700static int btree_io_failed_hook(struct page *page, int failed_mirror)
700{ 701{
701 struct extent_buffer *eb; 702 struct extent_buffer *eb;
702 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
703 703
704 eb = (struct extent_buffer *)page->private; 704 eb = (struct extent_buffer *)page->private;
705 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 705 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
706 eb->read_mirror = failed_mirror; 706 eb->read_mirror = failed_mirror;
707 atomic_dec(&eb->io_pages); 707 atomic_dec(&eb->io_pages);
708 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 708 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
709 btree_readahead_hook(root, eb, eb->start, -EIO); 709 btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
710 return -EIO; /* we fixed nothing */ 710 return -EIO; /* we fixed nothing */
711} 711}
712 712
@@ -816,7 +816,7 @@ static void run_one_async_done(struct btrfs_work *work)
816 waitqueue_active(&fs_info->async_submit_wait)) 816 waitqueue_active(&fs_info->async_submit_wait))
817 wake_up(&fs_info->async_submit_wait); 817 wake_up(&fs_info->async_submit_wait);
818 818
819 /* If an error occured we just want to clean up the bio and move on */ 819 /* If an error occurred we just want to clean up the bio and move on */
820 if (async->error) { 820 if (async->error) {
821 async->bio->bi_error = async->error; 821 async->bio->bi_error = async->error;
822 bio_endio(async->bio); 822 bio_endio(async->bio);
@@ -931,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
931 if (bio_flags & EXTENT_BIO_TREE_LOG) 931 if (bio_flags & EXTENT_BIO_TREE_LOG)
932 return 0; 932 return 0;
933#ifdef CONFIG_X86 933#ifdef CONFIG_X86
934 if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) 934 if (static_cpu_has(X86_FEATURE_XMM4_2))
935 return 0; 935 return 0;
936#endif 936#endif
937 return 1; 937 return 1;
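
check_async_write() decides whether metadata checksumming is cheap enough to do inline: with SSE4.2, the hardware crc32 instruction makes CRC32C nearly free, so the bio is not punted to a worker thread. static_cpu_has_safe() becomes plain static_cpu_has() because the two were merged in the x86 cpufeature rework; the kernel variant patches the branch at boot rather than testing on every call. A userspace analogue of the capability test, using the gcc/clang builtin (messages are illustrative):

    #include <stdio.h>

    int main(void)
    {
            if (__builtin_cpu_supports("sse4.2"))
                    printf("CRC32c in hardware: checksum inline\n");
            else
                    printf("no SSE4.2: offload checksumming to workers\n");
            return 0;
    }
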
@@ -1296,9 +1296,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1296 spin_lock_init(&root->root_item_lock); 1296 spin_lock_init(&root->root_item_lock);
1297} 1297}
1298 1298
1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1299static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1300 gfp_t flags)
1300{ 1301{
1301 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1302 struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1302 if (root) 1303 if (root)
1303 root->fs_info = fs_info; 1304 root->fs_info = fs_info;
1304 return root; 1305 return root;
@@ -1310,7 +1311,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
1310{ 1311{
1311 struct btrfs_root *root; 1312 struct btrfs_root *root;
1312 1313
1313 root = btrfs_alloc_root(NULL); 1314 root = btrfs_alloc_root(NULL, GFP_KERNEL);
1314 if (!root) 1315 if (!root)
1315 return ERR_PTR(-ENOMEM); 1316 return ERR_PTR(-ENOMEM);
1316 __setup_root(4096, 4096, 4096, root, NULL, 1); 1317 __setup_root(4096, 4096, 4096, root, NULL, 1);
@@ -1332,7 +1333,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1332 int ret = 0; 1333 int ret = 0;
1333 uuid_le uuid; 1334 uuid_le uuid;
1334 1335
1335 root = btrfs_alloc_root(fs_info); 1336 root = btrfs_alloc_root(fs_info, GFP_KERNEL);
1336 if (!root) 1337 if (!root)
1337 return ERR_PTR(-ENOMEM); 1338 return ERR_PTR(-ENOMEM);
1338 1339
@@ -1408,7 +1409,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1408 struct btrfs_root *tree_root = fs_info->tree_root; 1409 struct btrfs_root *tree_root = fs_info->tree_root;
1409 struct extent_buffer *leaf; 1410 struct extent_buffer *leaf;
1410 1411
1411 root = btrfs_alloc_root(fs_info); 1412 root = btrfs_alloc_root(fs_info, GFP_NOFS);
1412 if (!root) 1413 if (!root)
1413 return ERR_PTR(-ENOMEM); 1414 return ERR_PTR(-ENOMEM);
1414 1415
@@ -1506,7 +1507,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1506 if (!path) 1507 if (!path)
1507 return ERR_PTR(-ENOMEM); 1508 return ERR_PTR(-ENOMEM);
1508 1509
1509 root = btrfs_alloc_root(fs_info); 1510 root = btrfs_alloc_root(fs_info, GFP_NOFS);
1510 if (!root) { 1511 if (!root) {
1511 ret = -ENOMEM; 1512 ret = -ENOMEM;
1512 goto alloc_fail; 1513 goto alloc_fail;
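
btrfs_alloc_root() now takes the gfp mask from its caller instead of hard-coding GFP_NOFS. The split visible in these hunks, with the rationale inferred from the call sites rather than stated by the diff:

    /*
     *   GFP_KERNEL - btrfs_alloc_dummy_root(), btrfs_create_tree(),
     *                log replay and open_ctree(): mount/test paths where
     *                memory reclaim may safely recurse
     *   GFP_NOFS   - alloc_log_tree(), btrfs_read_tree_root(): can run
     *                with fs locks or a transaction held, so reclaim must
     *                not re-enter the filesystem
     */
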
@@ -2272,9 +2273,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2272 fs_info->dev_replace.lock_owner = 0; 2273 fs_info->dev_replace.lock_owner = 0;
2273 atomic_set(&fs_info->dev_replace.nesting_level, 0); 2274 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2274 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2275 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2275 mutex_init(&fs_info->dev_replace.lock_management_lock); 2276 rwlock_init(&fs_info->dev_replace.lock);
2276 mutex_init(&fs_info->dev_replace.lock); 2277 atomic_set(&fs_info->dev_replace.read_locks, 0);
2278 atomic_set(&fs_info->dev_replace.blocking_readers, 0);
2277 init_waitqueue_head(&fs_info->replace_wait); 2279 init_waitqueue_head(&fs_info->replace_wait);
2280 init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
2278} 2281}
2279 2282
2280static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) 2283static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2385,7 +2388,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2385 return -EIO; 2388 return -EIO;
2386 } 2389 }
2387 2390
2388 log_tree_root = btrfs_alloc_root(fs_info); 2391 log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2389 if (!log_tree_root) 2392 if (!log_tree_root)
2390 return -ENOMEM; 2393 return -ENOMEM;
2391 2394
@@ -2510,8 +2513,8 @@ int open_ctree(struct super_block *sb,
2510 int backup_index = 0; 2513 int backup_index = 0;
2511 int max_active; 2514 int max_active;
2512 2515
2513 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2516 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2514 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2517 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2515 if (!tree_root || !chunk_root) { 2518 if (!tree_root || !chunk_root) {
2516 err = -ENOMEM; 2519 err = -ENOMEM;
2517 goto fail; 2520 goto fail;
@@ -2603,6 +2606,7 @@ int open_ctree(struct super_block *sb,
2603 atomic_set(&fs_info->nr_async_bios, 0); 2606 atomic_set(&fs_info->nr_async_bios, 0);
2604 atomic_set(&fs_info->defrag_running, 0); 2607 atomic_set(&fs_info->defrag_running, 0);
2605 atomic_set(&fs_info->qgroup_op_seq, 0); 2608 atomic_set(&fs_info->qgroup_op_seq, 0);
2609 atomic_set(&fs_info->reada_works_cnt, 0);
2606 atomic64_set(&fs_info->tree_mod_seq, 0); 2610 atomic64_set(&fs_info->tree_mod_seq, 0);
2607 fs_info->sb = sb; 2611 fs_info->sb = sb;
2608 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 2612 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2622,7 +2626,7 @@ int open_ctree(struct super_block *sb,
2622 INIT_LIST_HEAD(&fs_info->ordered_roots); 2626 INIT_LIST_HEAD(&fs_info->ordered_roots);
2623 spin_lock_init(&fs_info->ordered_root_lock); 2627 spin_lock_init(&fs_info->ordered_root_lock);
2624 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2628 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2625 GFP_NOFS); 2629 GFP_KERNEL);
2626 if (!fs_info->delayed_root) { 2630 if (!fs_info->delayed_root) {
2627 err = -ENOMEM; 2631 err = -ENOMEM;
2628 goto fail_iput; 2632 goto fail_iput;
@@ -2750,7 +2754,7 @@ int open_ctree(struct super_block *sb,
2750 */ 2754 */
2751 fs_info->compress_type = BTRFS_COMPRESS_ZLIB; 2755 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2752 2756
2753 ret = btrfs_parse_options(tree_root, options); 2757 ret = btrfs_parse_options(tree_root, options, sb->s_flags);
2754 if (ret) { 2758 if (ret) {
2755 err = ret; 2759 err = ret;
2756 goto fail_alloc; 2760 goto fail_alloc;
@@ -3029,8 +3033,9 @@ retry_root_backup:
3029 if (ret) 3033 if (ret)
3030 goto fail_trans_kthread; 3034 goto fail_trans_kthread;
3031 3035
3032 /* do not make disk changes in broken FS */ 3036 /* do not make disk changes in broken FS or nologreplay is given */
3033 if (btrfs_super_log_root(disk_super) != 0) { 3037 if (btrfs_super_log_root(disk_super) != 0 &&
3038 !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
3034 ret = btrfs_replay_log(fs_info, fs_devices); 3039 ret = btrfs_replay_log(fs_info, fs_devices);
3035 if (ret) { 3040 if (ret) {
3036 err = ret; 3041 err = ret;
@@ -3146,6 +3151,12 @@ retry_root_backup:
3146 3151
3147 fs_info->open = 1; 3152 fs_info->open = 1;
3148 3153
3154 /*
3155 * backuproot only affect mount behavior, and if open_ctree succeeded,
3156 * no need to keep the flag
3157 */
3158 btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3159
3149 return 0; 3160 return 0;
3150 3161
3151fail_qgroup: 3162fail_qgroup:
@@ -3200,7 +3211,7 @@ fail:
3200 return err; 3211 return err;
3201 3212
3202 recovery_tree_root: 3213 recovery_tree_root:
3203 if (!btrfs_test_opt(tree_root, RECOVERY)) 3214 if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
3204 goto fail_tree_roots; 3215 goto fail_tree_roots;
3205 3216
3206 free_root_pointers(fs_info, 0); 3217 free_root_pointers(fs_info, 0);
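
Two mount-option changes surface in open_ctree(): the tree log is only replayed when nologreplay is not set (intended for inspecting a damaged filesystem read-only without modifying it), and the old "recovery" option appears under its clearer name usebackuproot, cleared once the open succeeds since it only influences mount itself. A minimal model of the new control flow (the option names are real, everything else is illustrative):

    #include <stdio.h>

    /* mount options are bits in fs_info->mount_opt; btrfs_test_opt()
     * and btrfs_clear_opt() are bit tests on that word */
    #define OPT_NOLOGREPLAY   (1u << 0)
    #define OPT_USEBACKUPROOT (1u << 1)

    static int open_ctree_model(unsigned int *mount_opt, int have_log)
    {
            if (have_log && !(*mount_opt & OPT_NOLOGREPLAY))
                    printf("replaying tree log\n");

            /* usebackuproot only affects mount; once the open
             * succeeded there is no reason to keep the flag */
            *mount_opt &= ~OPT_USEBACKUPROOT;
            return 0;
    }

    int main(void)
    {
            unsigned int opts = OPT_USEBACKUPROOT;
            return open_ctree_model(&opts, 1);
    }
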
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2287c7c10be..53e12977bfd0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4838,7 +4838,7 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4838 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4839 4839
4840 /* If we're just plain full then async reclaim just slows us down. */ 4840 /* If we're just plain full then async reclaim just slows us down. */
4841 if (space_info->bytes_used >= thresh) 4841 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4842 return 0; 4842 return 0;
4843 4843
4844 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4844 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -5373,27 +5373,33 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5373 5373
5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5374 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5375 5375
5376 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5376 if (block_rsv->reserved < block_rsv->size) {
5377 sinfo->bytes_reserved + sinfo->bytes_readonly + 5377 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5378 sinfo->bytes_may_use; 5378 sinfo->bytes_reserved + sinfo->bytes_readonly +
5379 5379 sinfo->bytes_may_use;
5380 if (sinfo->total_bytes > num_bytes) { 5380 if (sinfo->total_bytes > num_bytes) {
5381 num_bytes = sinfo->total_bytes - num_bytes; 5381 num_bytes = sinfo->total_bytes - num_bytes;
5382 block_rsv->reserved += num_bytes; 5382 num_bytes = min(num_bytes,
5383 sinfo->bytes_may_use += num_bytes; 5383 block_rsv->size - block_rsv->reserved);
5384 trace_btrfs_space_reservation(fs_info, "space_info", 5384 block_rsv->reserved += num_bytes;
5385 sinfo->flags, num_bytes, 1); 5385 sinfo->bytes_may_use += num_bytes;
5386 } 5386 trace_btrfs_space_reservation(fs_info, "space_info",
5387 5387 sinfo->flags, num_bytes,
5388 if (block_rsv->reserved >= block_rsv->size) { 5388 1);
5389 }
5390 } else if (block_rsv->reserved > block_rsv->size) {
5389 num_bytes = block_rsv->reserved - block_rsv->size; 5391 num_bytes = block_rsv->reserved - block_rsv->size;
5390 sinfo->bytes_may_use -= num_bytes; 5392 sinfo->bytes_may_use -= num_bytes;
5391 trace_btrfs_space_reservation(fs_info, "space_info", 5393 trace_btrfs_space_reservation(fs_info, "space_info",
5392 sinfo->flags, num_bytes, 0); 5394 sinfo->flags, num_bytes, 0);
5393 block_rsv->reserved = block_rsv->size; 5395 block_rsv->reserved = block_rsv->size;
5394 block_rsv->full = 1;
5395 } 5396 }
5396 5397
5398 if (block_rsv->reserved == block_rsv->size)
5399 block_rsv->full = 1;
5400 else
5401 block_rsv->full = 0;
5402
5397 spin_unlock(&block_rsv->lock); 5403 spin_unlock(&block_rsv->lock);
5398 spin_unlock(&sinfo->lock); 5404 spin_unlock(&sinfo->lock);
5399} 5405}
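
update_global_block_rsv() is reshaped so the reserve is only topped up while under-filled, the top-up is clamped to the remaining gap with min(), and ->full is derived from the final state instead of being set on one branch only. A compilable model of the new logic (field names mirror the kernel structs; the in_use aggregate stands for used + pinned + reserved + readonly + may_use):

    #include <stdio.h>

    typedef unsigned long long u64;

    static u64 min_u64(u64 a, u64 b) { return a < b ? a : b; }

    struct rsv   { u64 size, reserved; int full; };
    struct sinfo { u64 total_bytes, in_use; };

    static void update_rsv(struct rsv *r, struct sinfo *s)
    {
            if (r->reserved < r->size) {
                    if (s->total_bytes > s->in_use) {
                            /* never add more than the gap to ->size */
                            u64 add = min_u64(s->total_bytes - s->in_use,
                                              r->size - r->reserved);
                            r->reserved += add;
                            s->in_use += add;
                    }
            } else if (r->reserved > r->size) {
                    s->in_use -= r->reserved - r->size;
                    r->reserved = r->size;
            }
            r->full = (r->reserved == r->size);
    }

    int main(void)
    {
            struct sinfo s = { .total_bytes = 100, .in_use = 40 };
            struct rsv r = { .size = 50, .reserved = 10 };

            update_rsv(&r, &s);     /* tops up by min(60, 40) = 40 */
            printf("reserved=%llu full=%d\n", r.reserved, r.full);
            return 0;
    }
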
@@ -5752,7 +5758,7 @@ out_fail:
5752 5758
5753 /* 5759 /*
5754 * This is tricky, but first we need to figure out how much we 5760 * This is tricky, but first we need to figure out how much we
5755 * free'd from any free-ers that occured during this 5761 * free'd from any free-ers that occurred during this
5756 * reservation, so we reset ->csum_bytes to the csum_bytes 5762 * reservation, so we reset ->csum_bytes to the csum_bytes
5757 * before we dropped our lock, and then call the free for the 5763 * before we dropped our lock, and then call the free for the
5758 * number of bytes that were freed while we were trying our 5764 * number of bytes that were freed while we were trying our
@@ -7018,7 +7024,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7018 struct btrfs_free_cluster *cluster, 7024 struct btrfs_free_cluster *cluster,
7019 int delalloc) 7025 int delalloc)
7020{ 7026{
7021 struct btrfs_block_group_cache *used_bg; 7027 struct btrfs_block_group_cache *used_bg = NULL;
7022 bool locked = false; 7028 bool locked = false;
7023again: 7029again:
7024 spin_lock(&cluster->refill_lock); 7030 spin_lock(&cluster->refill_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 392592dc7010..76a0c8597d98 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -206,10 +206,8 @@ void extent_io_exit(void)
206 * destroy caches. 206 * destroy caches.
207 */ 207 */
208 rcu_barrier(); 208 rcu_barrier();
209 if (extent_state_cache) 209 kmem_cache_destroy(extent_state_cache);
210 kmem_cache_destroy(extent_state_cache); 210 kmem_cache_destroy(extent_buffer_cache);
211 if (extent_buffer_cache)
212 kmem_cache_destroy(extent_buffer_cache);
213 if (btrfs_bioset) 211 if (btrfs_bioset)
214 bioset_free(btrfs_bioset); 212 bioset_free(btrfs_bioset);
215} 213}
@@ -232,7 +230,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
232 if (!state) 230 if (!state)
233 return state; 231 return state;
234 state->state = 0; 232 state->state = 0;
235 state->private = 0; 233 state->failrec = NULL;
236 RB_CLEAR_NODE(&state->rb_node); 234 RB_CLEAR_NODE(&state->rb_node);
237 btrfs_leak_debug_add(&state->leak_list, &states); 235 btrfs_leak_debug_add(&state->leak_list, &states);
238 atomic_set(&state->refs, 1); 236 atomic_set(&state->refs, 1);
@@ -1844,7 +1842,8 @@ out:
1844 * set the private field for a given byte offset in the tree. If there isn't 1842 * set the private field for a given byte offset in the tree. If there isn't
1845 * an extent_state there already, this does nothing. 1843 * an extent_state there already, this does nothing.
1846 */ 1844 */
1847static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1845static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
1846 struct io_failure_record *failrec)
1848{ 1847{
1849 struct rb_node *node; 1848 struct rb_node *node;
1850 struct extent_state *state; 1849 struct extent_state *state;
@@ -1865,13 +1864,14 @@ static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private
1865 ret = -ENOENT; 1864 ret = -ENOENT;
1866 goto out; 1865 goto out;
1867 } 1866 }
1868 state->private = private; 1867 state->failrec = failrec;
1869out: 1868out:
1870 spin_unlock(&tree->lock); 1869 spin_unlock(&tree->lock);
1871 return ret; 1870 return ret;
1872} 1871}
1873 1872
1874int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1873static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
1874 struct io_failure_record **failrec)
1875{ 1875{
1876 struct rb_node *node; 1876 struct rb_node *node;
1877 struct extent_state *state; 1877 struct extent_state *state;
@@ -1892,7 +1892,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1892 ret = -ENOENT; 1892 ret = -ENOENT;
1893 goto out; 1893 goto out;
1894 } 1894 }
1895 *private = state->private; 1895 *failrec = state->failrec;
1896out: 1896out:
1897 spin_unlock(&tree->lock); 1897 spin_unlock(&tree->lock);
1898 return ret; 1898 return ret;
@@ -1972,7 +1972,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
1972 int err = 0; 1972 int err = 0;
1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1973 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1974 1974
1975 set_state_private(failure_tree, rec->start, 0); 1975 set_state_failrec(failure_tree, rec->start, NULL);
1976 ret = clear_extent_bits(failure_tree, rec->start, 1976 ret = clear_extent_bits(failure_tree, rec->start,
1977 rec->start + rec->len - 1, 1977 rec->start + rec->len - 1,
1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1978 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
@@ -2089,7 +2089,6 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2089 unsigned int pg_offset) 2089 unsigned int pg_offset)
2090{ 2090{
2091 u64 private; 2091 u64 private;
2092 u64 private_failure;
2093 struct io_failure_record *failrec; 2092 struct io_failure_record *failrec;
2094 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2093 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2095 struct extent_state *state; 2094 struct extent_state *state;
@@ -2102,12 +2101,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2102 if (!ret) 2101 if (!ret)
2103 return 0; 2102 return 0;
2104 2103
2105 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2104 ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
2106 &private_failure); 2105 &failrec);
2107 if (ret) 2106 if (ret)
2108 return 0; 2107 return 0;
2109 2108
2110 failrec = (struct io_failure_record *)(unsigned long) private_failure;
2111 BUG_ON(!failrec->this_mirror); 2109 BUG_ON(!failrec->this_mirror);
2112 2110
2113 if (failrec->in_validation) { 2111 if (failrec->in_validation) {
@@ -2167,7 +2165,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2167 2165
2168 next = next_state(state); 2166 next = next_state(state);
2169 2167
2170 failrec = (struct io_failure_record *)(unsigned long)state->private; 2168 failrec = state->failrec;
2171 free_extent_state(state); 2169 free_extent_state(state);
2172 kfree(failrec); 2170 kfree(failrec);
2173 2171
@@ -2177,10 +2175,9 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2177} 2175}
2178 2176
2179int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2177int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2180 struct io_failure_record **failrec_ret) 2178 struct io_failure_record **failrec_ret)
2181{ 2179{
2182 struct io_failure_record *failrec; 2180 struct io_failure_record *failrec;
2183 u64 private;
2184 struct extent_map *em; 2181 struct extent_map *em;
2185 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2182 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2186 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2183 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -2188,7 +2185,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2188 int ret; 2185 int ret;
2189 u64 logical; 2186 u64 logical;
2190 2187
2191 ret = get_state_private(failure_tree, start, &private); 2188 ret = get_state_failrec(failure_tree, start, &failrec);
2192 if (ret) { 2189 if (ret) {
2193 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2190 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2194 if (!failrec) 2191 if (!failrec)
@@ -2237,8 +2234,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2237 ret = set_extent_bits(failure_tree, start, end, 2234 ret = set_extent_bits(failure_tree, start, end,
2238 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2235 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2239 if (ret >= 0) 2236 if (ret >= 0)
2240 ret = set_state_private(failure_tree, start, 2237 ret = set_state_failrec(failure_tree, start, failrec);
2241 (u64)(unsigned long)failrec);
2242 /* set the bits in the inode's tree */ 2238 /* set the bits in the inode's tree */
2243 if (ret >= 0) 2239 if (ret >= 0)
2244 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2240 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
@@ -2248,7 +2244,6 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2248 return ret; 2244 return ret;
2249 } 2245 }
2250 } else { 2246 } else {
2251 failrec = (struct io_failure_record *)(unsigned long)private;
2252 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", 2247 pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
2253 failrec->logical, failrec->start, failrec->len, 2248 failrec->logical, failrec->start, failrec->len,
2254 failrec->in_validation); 2249 failrec->in_validation);
@@ -3177,7 +3172,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
3177 3172
3178 while (1) { 3173 while (1) {
3179 lock_extent(tree, start, end); 3174 lock_extent(tree, start, end);
3180 ordered = btrfs_lookup_ordered_extent(inode, start); 3175 ordered = btrfs_lookup_ordered_range(inode, start,
3176 PAGE_CACHE_SIZE);
3181 if (!ordered) 3177 if (!ordered)
3182 break; 3178 break;
3183 unlock_extent(tree, start, end); 3179 unlock_extent(tree, start, end);
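
The extent_io.c changes retire the per-state "private" u64, which only ever held a casted io_failure_record pointer: the field becomes a typed failrec, set/get_state_private become set/get_state_failrec, and the double casts disappear so the compiler can check what used to be laundered through u64 (the last hunk also switches the readpage path to a range-based ordered-extent lookup, more subpage-blocksize groundwork). A self-contained before/after of the typing change:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct io_failure_record { int this_mirror; };

    /* before: a typeless slot, every user casts in and out */
    struct state_old { uint64_t private; };

    /* after: the slot says what it holds; storing anything else
     * is now a compile error */
    struct state_new { struct io_failure_record *failrec; };

    int main(void)
    {
            struct io_failure_record *rec = calloc(1, sizeof(*rec));
            struct state_old o = { .private = (uint64_t)(uintptr_t)rec };
            struct state_new n = { .failrec = rec };

            /* the old read side needed the reverse cast: */
            struct io_failure_record *r1 =
                    (struct io_failure_record *)(uintptr_t)o.private;
            struct io_failure_record *r2 = n.failrec;  /* new: direct */

            printf("%d %d\n", r1->this_mirror, r2->this_mirror);
            free(rec);
            return 0;
    }
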
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 880d5292e972..5dbf92e68fbd 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -61,6 +61,7 @@
61struct extent_state; 61struct extent_state;
62struct btrfs_root; 62struct btrfs_root;
63struct btrfs_io_bio; 63struct btrfs_io_bio;
64struct io_failure_record;
64 65
65typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 66typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
66 struct bio *bio, int mirror_num, 67 struct bio *bio, int mirror_num,
@@ -111,8 +112,7 @@ struct extent_state {
111 atomic_t refs; 112 atomic_t refs;
112 unsigned state; 113 unsigned state;
113 114
114 /* for use by the FS */ 115 struct io_failure_record *failrec;
115 u64 private;
116 116
117#ifdef CONFIG_BTRFS_DEBUG 117#ifdef CONFIG_BTRFS_DEBUG
118 struct list_head leak_list; 118 struct list_head leak_list;
@@ -342,7 +342,6 @@ int extent_readpages(struct extent_io_tree *tree,
342 get_extent_t get_extent); 342 get_extent_t get_extent);
343int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 343int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
344 __u64 start, __u64 len, get_extent_t *get_extent); 344 __u64 start, __u64 len, get_extent_t *get_extent);
345int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
346void set_page_extent_mapped(struct page *page); 345void set_page_extent_mapped(struct page *page);
347 346
348struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 347struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 84fb56d5c018..318b048eb254 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
4#include <linux/hardirq.h> 4#include <linux/hardirq.h>
5#include "ctree.h" 5#include "ctree.h"
6#include "extent_map.h" 6#include "extent_map.h"
7#include "compression.h"
7 8
8 9
9static struct kmem_cache *extent_map_cache; 10static struct kmem_cache *extent_map_cache;
@@ -20,8 +21,7 @@ int __init extent_map_init(void)
20 21
21void extent_map_exit(void) 22void extent_map_exit(void)
22{ 23{
23 if (extent_map_cache) 24 kmem_cache_destroy(extent_map_cache);
24 kmem_cache_destroy(extent_map_cache);
25} 25}
26 26
27/** 27/**
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
62 62
63/** 63/**
64 * free_extent_map - drop reference count of an extent_map 64 * free_extent_map - drop reference count of an extent_map
65 * @em: extent map beeing releasead 65 * @em: extent map being releasead
66 * 66 *
67 * Drops the reference out on @em by one and free the structure 67 * Drops the reference out on @em by one and free the structure
68 * if the reference count hits zero. 68 * if the reference count hits zero.
@@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
422/** 422/**
423 * remove_extent_mapping - removes an extent_map from the extent tree 423 * remove_extent_mapping - removes an extent_map from the extent tree
424 * @tree: extent tree to remove from 424 * @tree: extent tree to remove from
425 * @em: extent map beeing removed 425 * @em: extent map being removed
426 * 426 *
427 * Removes @em from @tree. No reference counts are dropped, and no checks 427 * Removes @em from @tree. No reference counts are dropped, and no checks
428 * are done to see if the range is in use 428 * are done to see if the range is in use
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a67e1c828d0f..b5baf5bdc8e1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "volumes.h" 26#include "volumes.h"
27#include "print-tree.h" 27#include "print-tree.h"
28#include "compression.h"
28 29
29#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ 30#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
30 sizeof(struct btrfs_item) * 2) / \ 31 sizeof(struct btrfs_item) * 2) / \
@@ -172,6 +173,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
172 u64 item_start_offset = 0; 173 u64 item_start_offset = 0;
173 u64 item_last_offset = 0; 174 u64 item_last_offset = 0;
174 u64 disk_bytenr; 175 u64 disk_bytenr;
176 u64 page_bytes_left;
175 u32 diff; 177 u32 diff;
176 int nblocks; 178 int nblocks;
177 int bio_index = 0; 179 int bio_index = 0;
@@ -220,6 +222,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
220 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; 222 disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
221 if (dio) 223 if (dio)
222 offset = logical_offset; 224 offset = logical_offset;
225
226 page_bytes_left = bvec->bv_len;
223 while (bio_index < bio->bi_vcnt) { 227 while (bio_index < bio->bi_vcnt) {
224 if (!dio) 228 if (!dio)
225 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 229 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
@@ -243,7 +247,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
243 if (BTRFS_I(inode)->root->root_key.objectid == 247 if (BTRFS_I(inode)->root->root_key.objectid ==
244 BTRFS_DATA_RELOC_TREE_OBJECTID) { 248 BTRFS_DATA_RELOC_TREE_OBJECTID) {
245 set_extent_bits(io_tree, offset, 249 set_extent_bits(io_tree, offset,
246 offset + bvec->bv_len - 1, 250 offset + root->sectorsize - 1,
247 EXTENT_NODATASUM, GFP_NOFS); 251 EXTENT_NODATASUM, GFP_NOFS);
248 } else { 252 } else {
249 btrfs_info(BTRFS_I(inode)->root->fs_info, 253 btrfs_info(BTRFS_I(inode)->root->fs_info,
@@ -281,13 +285,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
281found: 285found:
282 csum += count * csum_size; 286 csum += count * csum_size;
283 nblocks -= count; 287 nblocks -= count;
284 bio_index += count; 288
285 while (count--) { 289 while (count--) {
286 disk_bytenr += bvec->bv_len; 290 disk_bytenr += root->sectorsize;
287 offset += bvec->bv_len; 291 offset += root->sectorsize;
288 bvec++; 292 page_bytes_left -= root->sectorsize;
293 if (!page_bytes_left) {
294 bio_index++;
295 /*
296 * make sure we're still inside the
297 * bio before we update page_bytes_left
298 */
299 if (bio_index >= bio->bi_vcnt) {
300 WARN_ON_ONCE(count);
301 goto done;
302 }
303 bvec++;
304 page_bytes_left = bvec->bv_len;
305 }
306
289 } 307 }
290 } 308 }
309
310done:
291 btrfs_free_path(path); 311 btrfs_free_path(path);
292 return 0; 312 return 0;
293} 313}
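
Instead of assuming one bio_vec equals one checksummed block, the rewritten loop keeps page_bytes_left and advances disk_bytenr/offset in sectorsize steps, only moving to the next bvec once the current page is consumed; this is groundwork for sectorsize smaller than the page size. A compilable model of the iteration (a plain array of lengths stands in for the bio's bvecs):

    #include <stdio.h>

    int main(void)
    {
            const unsigned sectorsize = 4096;
            const unsigned bv_len[] = { 16384, 16384 };   /* two 16K "pages" */
            const int bi_vcnt = 2;

            unsigned long long disk_bytenr = 0;
            int bio_index = 0;
            unsigned page_bytes_left = bv_len[0];

            while (bio_index < bi_vcnt) {
                    printf("csum sector at %llu (page %d)\n",
                           disk_bytenr, bio_index);
                    disk_bytenr += sectorsize;
                    page_bytes_left -= sectorsize;
                    if (!page_bytes_left) {
                            /* stay inside the bio before reloading */
                            if (++bio_index >= bi_vcnt)
                                    break;
                            page_bytes_left = bv_len[bio_index];
                    }
            }
            return 0;
    }
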
@@ -432,6 +452,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
432 struct bio_vec *bvec = bio->bi_io_vec; 452 struct bio_vec *bvec = bio->bi_io_vec;
433 int bio_index = 0; 453 int bio_index = 0;
434 int index; 454 int index;
455 int nr_sectors;
456 int i;
435 unsigned long total_bytes = 0; 457 unsigned long total_bytes = 0;
436 unsigned long this_sum_bytes = 0; 458 unsigned long this_sum_bytes = 0;
437 u64 offset; 459 u64 offset;
@@ -459,41 +481,56 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
459 if (!contig) 481 if (!contig)
460 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 482 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
461 483
462 if (offset >= ordered->file_offset + ordered->len || 484 data = kmap_atomic(bvec->bv_page);
463 offset < ordered->file_offset) {
464 unsigned long bytes_left;
465 sums->len = this_sum_bytes;
466 this_sum_bytes = 0;
467 btrfs_add_ordered_sum(inode, ordered, sums);
468 btrfs_put_ordered_extent(ordered);
469 485
470 bytes_left = bio->bi_iter.bi_size - total_bytes; 486 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
487 bvec->bv_len + root->sectorsize
488 - 1);
489
490 for (i = 0; i < nr_sectors; i++) {
491 if (offset >= ordered->file_offset + ordered->len ||
492 offset < ordered->file_offset) {
493 unsigned long bytes_left;
494
495 kunmap_atomic(data);
496 sums->len = this_sum_bytes;
497 this_sum_bytes = 0;
498 btrfs_add_ordered_sum(inode, ordered, sums);
499 btrfs_put_ordered_extent(ordered);
500
501 bytes_left = bio->bi_iter.bi_size - total_bytes;
502
503 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
504 GFP_NOFS);
505 BUG_ON(!sums); /* -ENOMEM */
506 sums->len = bytes_left;
507 ordered = btrfs_lookup_ordered_extent(inode,
508 offset);
509 ASSERT(ordered); /* Logic error */
510 sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
511 + total_bytes;
512 index = 0;
513
514 data = kmap_atomic(bvec->bv_page);
515 }
471 516
472 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 517 sums->sums[index] = ~(u32)0;
473 GFP_NOFS); 518 sums->sums[index]
474 BUG_ON(!sums); /* -ENOMEM */ 519 = btrfs_csum_data(data + bvec->bv_offset
475 sums->len = bytes_left; 520 + (i * root->sectorsize),
476 ordered = btrfs_lookup_ordered_extent(inode, offset); 521 sums->sums[index],
477 BUG_ON(!ordered); /* Logic error */ 522 root->sectorsize);
478 sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + 523 btrfs_csum_final(sums->sums[index],
479 total_bytes; 524 (char *)(sums->sums + index));
480 index = 0; 525 index++;
526 offset += root->sectorsize;
527 this_sum_bytes += root->sectorsize;
528 total_bytes += root->sectorsize;
481 } 529 }
482 530
483 data = kmap_atomic(bvec->bv_page);
484 sums->sums[index] = ~(u32)0;
485 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
486 sums->sums[index],
487 bvec->bv_len);
488 kunmap_atomic(data); 531 kunmap_atomic(data);
489 btrfs_csum_final(sums->sums[index],
490 (char *)(sums->sums + index));
491 532
492 bio_index++; 533 bio_index++;
493 index++;
494 total_bytes += bvec->bv_len;
495 this_sum_bytes += bvec->bv_len;
496 offset += bvec->bv_len;
497 bvec++; 534 bvec++;
498 } 535 }
499 this_sum_bytes = 0; 536 this_sum_bytes = 0;
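
Beyond making the checksum loop per-sector, the restructure preserves a kmap_atomic ordering constraint: an atomic mapping disables preemption, so the sleeping kzalloc(..., GFP_NOFS) may only be reached after kunmap_atomic(), and the page is remapped before checksumming resumes. The shape, sketched as comments (illustrative, not compilable):

    /*
     *  data = kmap_atomic(bvec->bv_page);
     *  for (i = 0; i < nr_sectors; i++) {
     *          if (sector falls outside the current ordered extent) {
     *                  kunmap_atomic(data);            <- before sleeping
     *                  sums = kzalloc(..., GFP_NOFS);  <- may sleep
     *                  (look up the next ordered extent)
     *                  data = kmap_atomic(bvec->bv_page);
     *          }
     *          csum one sectorsize chunk at data + bv_offset + i * sectorsize
     *  }
     *  kunmap_atomic(data);
     */
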
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 098bb8f690c9..15a09cb156ce 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "locking.h" 41#include "locking.h"
42#include "volumes.h" 42#include "volumes.h"
43#include "qgroup.h" 43#include "qgroup.h"
44#include "compression.h"
44 45
45static struct kmem_cache *btrfs_inode_defrag_cachep; 46static struct kmem_cache *btrfs_inode_defrag_cachep;
46/* 47/*
@@ -498,7 +499,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
498 loff_t isize = i_size_read(inode); 499 loff_t isize = i_size_read(inode);
499 500
500 start_pos = pos & ~((u64)root->sectorsize - 1); 501 start_pos = pos & ~((u64)root->sectorsize - 1);
501 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 502 num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
502 503
503 end_of_last_block = start_pos + num_bytes - 1; 504 end_of_last_block = start_pos + num_bytes - 1;
504 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 505 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -1379,16 +1380,19 @@ fail:
1379static noinline int 1380static noinline int
1380lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, 1381lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1381 size_t num_pages, loff_t pos, 1382 size_t num_pages, loff_t pos,
1383 size_t write_bytes,
1382 u64 *lockstart, u64 *lockend, 1384 u64 *lockstart, u64 *lockend,
1383 struct extent_state **cached_state) 1385 struct extent_state **cached_state)
1384{ 1386{
1387 struct btrfs_root *root = BTRFS_I(inode)->root;
1385 u64 start_pos; 1388 u64 start_pos;
1386 u64 last_pos; 1389 u64 last_pos;
1387 int i; 1390 int i;
1388 int ret = 0; 1391 int ret = 0;
1389 1392
1390 start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); 1393 start_pos = round_down(pos, root->sectorsize);
1391 last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; 1394 last_pos = start_pos
1395 + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
1392 1396
1393 if (start_pos < inode->i_size) { 1397 if (start_pos < inode->i_size) {
1394 struct btrfs_ordered_extent *ordered; 1398 struct btrfs_ordered_extent *ordered;
@@ -1503,6 +1507,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1503 1507
1504 while (iov_iter_count(i) > 0) { 1508 while (iov_iter_count(i) > 0) {
1505 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1509 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1510 size_t sector_offset;
1506 size_t write_bytes = min(iov_iter_count(i), 1511 size_t write_bytes = min(iov_iter_count(i),
1507 nrptrs * (size_t)PAGE_CACHE_SIZE - 1512 nrptrs * (size_t)PAGE_CACHE_SIZE -
1508 offset); 1513 offset);
@@ -1511,6 +1516,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1511 size_t reserve_bytes; 1516 size_t reserve_bytes;
1512 size_t dirty_pages; 1517 size_t dirty_pages;
1513 size_t copied; 1518 size_t copied;
1519 size_t dirty_sectors;
1520 size_t num_sectors;
1514 1521
1515 WARN_ON(num_pages > nrptrs); 1522 WARN_ON(num_pages > nrptrs);
1516 1523
@@ -1523,29 +1530,29 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1523 break; 1530 break;
1524 } 1531 }
1525 1532
1526 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1533 sector_offset = pos & (root->sectorsize - 1);
1534 reserve_bytes = round_up(write_bytes + sector_offset,
1535 root->sectorsize);
1527 1536
1528 if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1537 if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1529 BTRFS_INODE_PREALLOC)) { 1538 BTRFS_INODE_PREALLOC)) &&
1530 ret = check_can_nocow(inode, pos, &write_bytes); 1539 check_can_nocow(inode, pos, &write_bytes) > 0) {
1531 if (ret < 0) 1540 /*
1532 break; 1541 * For nodata cow case, no need to reserve
1533 if (ret > 0) { 1542 * data space.
1534 /* 1543 */
1535 * For nodata cow case, no need to reserve 1544 only_release_metadata = true;
1536 * data space. 1545 /*
1537 */ 1546 * our prealloc extent may be smaller than
1538 only_release_metadata = true; 1547 * write_bytes, so scale down.
1539 /* 1548 */
1540 * our prealloc extent may be smaller than 1549 num_pages = DIV_ROUND_UP(write_bytes + offset,
1541 * write_bytes, so scale down. 1550 PAGE_CACHE_SIZE);
1542 */ 1551 reserve_bytes = round_up(write_bytes + sector_offset,
1543 num_pages = DIV_ROUND_UP(write_bytes + offset, 1552 root->sectorsize);
1544 PAGE_CACHE_SIZE); 1553 goto reserve_metadata;
1545 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1546 goto reserve_metadata;
1547 }
1548 } 1554 }
1555
1549 ret = btrfs_check_data_free_space(inode, pos, write_bytes); 1556 ret = btrfs_check_data_free_space(inode, pos, write_bytes);
1550 if (ret < 0) 1557 if (ret < 0)
1551 break; 1558 break;
@@ -1576,8 +1583,8 @@ again:
1576 break; 1583 break;
1577 1584
1578 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, 1585 ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1579 pos, &lockstart, &lockend, 1586 pos, write_bytes, &lockstart,
1580 &cached_state); 1587 &lockend, &cached_state);
1581 if (ret < 0) { 1588 if (ret < 0) {
1582 if (ret == -EAGAIN) 1589 if (ret == -EAGAIN)
1583 goto again; 1590 goto again;
@@ -1612,9 +1619,16 @@ again:
1612 * we still have an outstanding extent for the chunk we actually 1619 * we still have an outstanding extent for the chunk we actually
1613 * managed to copy. 1620 * managed to copy.
1614 */ 1621 */
1615 if (num_pages > dirty_pages) { 1622 num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
1616 release_bytes = (num_pages - dirty_pages) << 1623 reserve_bytes);
1617 PAGE_CACHE_SHIFT; 1624 dirty_sectors = round_up(copied + sector_offset,
1625 root->sectorsize);
1626 dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
1627 dirty_sectors);
1628
1629 if (num_sectors > dirty_sectors) {
1630 release_bytes = (write_bytes - copied)
1631 & ~((u64)root->sectorsize - 1);
1618 if (copied > 0) { 1632 if (copied > 0) {
1619 spin_lock(&BTRFS_I(inode)->lock); 1633 spin_lock(&BTRFS_I(inode)->lock);
1620 BTRFS_I(inode)->outstanding_extents++; 1634 BTRFS_I(inode)->outstanding_extents++;
@@ -1633,7 +1647,8 @@ again:
1633 } 1647 }
1634 } 1648 }
1635 1649
1636 release_bytes = dirty_pages << PAGE_CACHE_SHIFT; 1650 release_bytes = round_up(copied + sector_offset,
1651 root->sectorsize);
1637 1652
1638 if (copied > 0) 1653 if (copied > 0)
1639 ret = btrfs_dirty_pages(root, inode, pages, 1654 ret = btrfs_dirty_pages(root, inode, pages,
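
With reservations now made in sectorsize units, the short-copy cleanup compares sectors rather than pages: num_sectors covers what was reserved, dirty_sectors covers what the copy actually touched, and the difference is released rounded down to a sector boundary, matching the new release_bytes expression. The arithmetic on illustrative numbers:

    #include <stdio.h>

    typedef unsigned long long u64;

    int main(void)
    {
            u64 sectorsize = 4096;
            u64 write_bytes = 20000, sector_offset = 904, copied = 5000;

            u64 reserve = (write_bytes + sector_offset + sectorsize - 1)
                            & ~(sectorsize - 1);        /* round_up */
            u64 num_sectors = reserve / sectorsize;     /* 6 sectors */
            u64 dirty = (copied + sector_offset + sectorsize - 1)
                            & ~(sectorsize - 1);
            u64 dirty_sectors = dirty / sectorsize;     /* 2 sectors */

            if (num_sectors > dirty_sectors) {
                    /* round *down*: partial sectors stay reserved */
                    u64 release = (write_bytes - copied) & ~(sectorsize - 1);
                    printf("release %llu of %llu reserved bytes\n",
                           release, reserve);
            }
            return 0;
    }
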
@@ -1654,8 +1669,7 @@ again:
1654 1669
1655 if (only_release_metadata && copied > 0) { 1670 if (only_release_metadata && copied > 0) {
1656 lockstart = round_down(pos, root->sectorsize); 1671 lockstart = round_down(pos, root->sectorsize);
1657 lockend = lockstart + 1672 lockend = round_up(pos + copied, root->sectorsize) - 1;
1658 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1659 1673
1660 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1674 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1661 lockend, EXTENT_NORESERVE, NULL, 1675 lockend, EXTENT_NORESERVE, NULL,
@@ -1761,6 +1775,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1761 ssize_t err; 1775 ssize_t err;
1762 loff_t pos; 1776 loff_t pos;
1763 size_t count; 1777 size_t count;
1778 loff_t oldsize;
1779 int clean_page = 0;
1764 1780
1765 inode_lock(inode); 1781 inode_lock(inode);
1766 err = generic_write_checks(iocb, from); 1782 err = generic_write_checks(iocb, from);
@@ -1799,14 +1815,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1799 pos = iocb->ki_pos; 1815 pos = iocb->ki_pos;
1800 count = iov_iter_count(from); 1816 count = iov_iter_count(from);
1801 start_pos = round_down(pos, root->sectorsize); 1817 start_pos = round_down(pos, root->sectorsize);
1802 if (start_pos > i_size_read(inode)) { 1818 oldsize = i_size_read(inode);
1819 if (start_pos > oldsize) {
1803 /* Expand hole size to cover write data, preventing empty gap */ 1820 /* Expand hole size to cover write data, preventing empty gap */
1804 end_pos = round_up(pos + count, root->sectorsize); 1821 end_pos = round_up(pos + count, root->sectorsize);
1805 err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); 1822 err = btrfs_cont_expand(inode, oldsize, end_pos);
1806 if (err) { 1823 if (err) {
1807 inode_unlock(inode); 1824 inode_unlock(inode);
1808 goto out; 1825 goto out;
1809 } 1826 }
1827 if (start_pos > round_up(oldsize, root->sectorsize))
1828 clean_page = 1;
1810 } 1829 }
1811 1830
1812 if (sync) 1831 if (sync)
@@ -1818,6 +1837,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1818 num_written = __btrfs_buffered_write(file, from, pos); 1837 num_written = __btrfs_buffered_write(file, from, pos);
1819 if (num_written > 0) 1838 if (num_written > 0)
1820 iocb->ki_pos = pos + num_written; 1839 iocb->ki_pos = pos + num_written;
1840 if (clean_page)
1841 pagecache_isize_extended(inode, oldsize,
1842 i_size_read(inode));
1821 } 1843 }
1822 1844
1823 inode_unlock(inode); 1845 inode_unlock(inode);
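The new clean_page flag arms a pagecache_isize_extended() call for writes that start beyond the block-rounded old EOF, where a page straddling the old size could otherwise keep stale state. A sketch of just that predicate (the sectorsize constant is an assumption; the kernel reads root->sectorsize):

    #include <stdbool.h>
    #include <stdint.h>

    #define SECTORSIZE 4096ULL   /* assumption; kernel uses root->sectorsize */

    static uint64_t round_down_u64(uint64_t x, uint64_t a) { return x & ~(a - 1); }
    static uint64_t round_up_u64(uint64_t x, uint64_t a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    /*
     * True when the write begins past the block-rounded old EOF: the
     * expanding write leaves a gap, so the page straddling the old size
     * must be revalidated once i_size has grown.
     */
    static bool needs_isize_extend(uint64_t pos, uint64_t oldsize)
    {
            return round_down_u64(pos, SECTORSIZE) >
                   round_up_u64(oldsize, SECTORSIZE);
    }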
@@ -1825,7 +1847,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1825 /* 1847 /*
1826 * We also have to set last_sub_trans to the current log transid, 1848 * We also have to set last_sub_trans to the current log transid,
1827 * otherwise subsequent syncs to a file that's been synced in this 1849 * otherwise subsequent syncs to a file that's been synced in this
1828 * transaction will appear to have already occured. 1850 * transaction will appear to have already occurred.
1829 */ 1851 */
1830 spin_lock(&BTRFS_I(inode)->lock); 1852 spin_lock(&BTRFS_I(inode)->lock);
1831 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1853 BTRFS_I(inode)->last_sub_trans = root->log_transid;
@@ -1996,10 +2018,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1996 */ 2018 */
1997 smp_mb(); 2019 smp_mb();
1998 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 2020 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1999 (BTRFS_I(inode)->last_trans <= 2021 (full_sync && BTRFS_I(inode)->last_trans <=
2000 root->fs_info->last_trans_committed && 2022 root->fs_info->last_trans_committed) ||
2001 (full_sync || 2023 (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
2002 !btrfs_have_ordered_extents_in_range(inode, start, len)))) { 2024 BTRFS_I(inode)->last_trans
2025 <= root->fs_info->last_trans_committed)) {
2003 /* 2026 /*
2004 * We've had everything committed since the last time we were 2027 * We've had everything committed since the last time we were
2005 * modified so clear this flag in case it was set for whatever 2028 * modified so clear this flag in case it was set for whatever
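The restructured condition above flattens the old nested AND/OR into three independent ways to skip the log. A boolean sketch with illustrative names, not kernel symbols:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch of the reworked fast path: skip logging when the inode is
     * already in the current log, or when nothing changed since the last
     * committed transaction - for a full sync unconditionally, for a
     * ranged sync only if no ordered extents overlap the range.
     */
    static bool can_skip_fsync(bool inode_in_log, bool full_sync,
                               bool ordered_in_range,
                               uint64_t last_trans, uint64_t last_committed)
    {
            if (inode_in_log)
                    return true;
            if (full_sync && last_trans <= last_committed)
                    return true;
            return !ordered_in_range && last_trans <= last_committed;
    }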
@@ -2293,10 +2316,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2293 int ret = 0; 2316 int ret = 0;
2294 int err = 0; 2317 int err = 0;
2295 unsigned int rsv_count; 2318 unsigned int rsv_count;
2296 bool same_page; 2319 bool same_block;
2297 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2320 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2298 u64 ino_size; 2321 u64 ino_size;
2299 bool truncated_page = false; 2322 bool truncated_block = false;
2300 bool updated_inode = false; 2323 bool updated_inode = false;
2301 2324
2302 ret = btrfs_wait_ordered_range(inode, offset, len); 2325 ret = btrfs_wait_ordered_range(inode, offset, len);
@@ -2304,7 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2304 return ret; 2327 return ret;
2305 2328
2306 inode_lock(inode); 2329 inode_lock(inode);
2307 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); 2330 ino_size = round_up(inode->i_size, root->sectorsize);
2308 ret = find_first_non_hole(inode, &offset, &len); 2331 ret = find_first_non_hole(inode, &offset, &len);
2309 if (ret < 0) 2332 if (ret < 0)
2310 goto out_only_mutex; 2333 goto out_only_mutex;
@@ -2317,31 +2340,30 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2317 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); 2340 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2318 lockend = round_down(offset + len, 2341 lockend = round_down(offset + len,
2319 BTRFS_I(inode)->root->sectorsize) - 1; 2342 BTRFS_I(inode)->root->sectorsize) - 1;
2320 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2343 same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
2321 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2344 == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
2322
2323 /* 2345 /*
2324 * We needn't truncate any page which is beyond the end of the file 2346 * We needn't truncate any block which is beyond the end of the file
2325 * because we are sure there is no data there. 2347 * because we are sure there is no data there.
2326 */ 2348 */
2327 /* 2349 /*
2328 * Only do this if we are in the same page and we aren't doing the 2350 * Only do this if we are in the same block and we aren't doing the
2329 * entire page. 2351 * entire block.
2330 */ 2352 */
2331 if (same_page && len < PAGE_CACHE_SIZE) { 2353 if (same_block && len < root->sectorsize) {
2332 if (offset < ino_size) { 2354 if (offset < ino_size) {
2333 truncated_page = true; 2355 truncated_block = true;
2334 ret = btrfs_truncate_page(inode, offset, len, 0); 2356 ret = btrfs_truncate_block(inode, offset, len, 0);
2335 } else { 2357 } else {
2336 ret = 0; 2358 ret = 0;
2337 } 2359 }
2338 goto out_only_mutex; 2360 goto out_only_mutex;
2339 } 2361 }
2340 2362
2341 /* zero back part of the first page */ 2363 /* zero back part of the first block */
2342 if (offset < ino_size) { 2364 if (offset < ino_size) {
2343 truncated_page = true; 2365 truncated_block = true;
2344 ret = btrfs_truncate_page(inode, offset, 0, 0); 2366 ret = btrfs_truncate_block(inode, offset, 0, 0);
2345 if (ret) { 2367 if (ret) {
2346 inode_unlock(inode); 2368 inode_unlock(inode);
2347 return ret; 2369 return ret;
@@ -2376,9 +2398,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2376 if (!ret) { 2398 if (!ret) {
2377 /* zero the front end of the last page */ 2399 /* zero the front end of the last page */
2378 if (tail_start + tail_len < ino_size) { 2400 if (tail_start + tail_len < ino_size) {
2379 truncated_page = true; 2401 truncated_block = true;
2380 ret = btrfs_truncate_page(inode, 2402 ret = btrfs_truncate_block(inode,
2381 tail_start + tail_len, 0, 1); 2403 tail_start + tail_len,
2404 0, 1);
2382 if (ret) 2405 if (ret)
2383 goto out_only_mutex; 2406 goto out_only_mutex;
2384 } 2407 }
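same_block above asks whether the punched range starts and ends inside one filesystem block; only then does the whole punch reduce to partial-block zeroing via btrfs_truncate_block(). A sketch of the predicate with an assumed 4K block:

    #include <stdbool.h>
    #include <stdint.h>

    #define SECTORSIZE 4096ULL   /* assumption; kernel uses root->sectorsize */

    /* Do the first and last byte of the hole land in the same block?
     * Example: offset 5000, len 100 stays inside block 1, so the punch
     * degenerates to zeroing part of that block in place. */
    static bool same_block(uint64_t offset, uint64_t len)
    {
            return (offset / SECTORSIZE) == ((offset + len - 1) / SECTORSIZE);
    }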
@@ -2544,7 +2567,7 @@ out_trans:
2544 goto out_free; 2567 goto out_free;
2545 2568
2546 inode_inc_iversion(inode); 2569 inode_inc_iversion(inode);
2547 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2570 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
2548 2571
2549 trans->block_rsv = &root->fs_info->trans_block_rsv; 2572 trans->block_rsv = &root->fs_info->trans_block_rsv;
2550 ret = btrfs_update_inode(trans, root, inode); 2573 ret = btrfs_update_inode(trans, root, inode);
@@ -2558,7 +2581,7 @@ out:
2558 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2581 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2559 &cached_state, GFP_NOFS); 2582 &cached_state, GFP_NOFS);
2560out_only_mutex: 2583out_only_mutex:
2561 if (!updated_inode && truncated_page && !ret && !err) { 2584 if (!updated_inode && truncated_block && !ret && !err) {
2562 /* 2585 /*
2563 * If we only end up zeroing part of a page, we still need to 2586 * If we only end up zeroing part of a page, we still need to
2564 * update the inode item, so that all the time fields are 2587 * update the inode item, so that all the time fields are
@@ -2611,7 +2634,7 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2611 return 0; 2634 return 0;
2612 } 2635 }
2613insert: 2636insert:
2614 range = kmalloc(sizeof(*range), GFP_NOFS); 2637 range = kmalloc(sizeof(*range), GFP_KERNEL);
2615 if (!range) 2638 if (!range)
2616 return -ENOMEM; 2639 return -ENOMEM;
2617 range->start = start; 2640 range->start = start;
@@ -2678,10 +2701,10 @@ static long btrfs_fallocate(struct file *file, int mode,
2678 } else if (offset + len > inode->i_size) { 2701 } else if (offset + len > inode->i_size) {
2679 /* 2702 /*
2680 * If we are fallocating from the end of the file onward we 2703 * If we are fallocating from the end of the file onward we
2681 * need to zero out the end of the page if i_size lands in the 2704 * need to zero out the end of the block if i_size lands in the
2682 * middle of a page. 2705 * middle of a block.
2683 */ 2706 */
2684 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); 2707 ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
2685 if (ret) 2708 if (ret)
2686 goto out; 2709 goto out;
2687 } 2710 }
@@ -2712,7 +2735,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2712 btrfs_put_ordered_extent(ordered); 2735 btrfs_put_ordered_extent(ordered);
2713 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 2736 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2714 alloc_start, locked_end, 2737 alloc_start, locked_end,
2715 &cached_state, GFP_NOFS); 2738 &cached_state, GFP_KERNEL);
2716 /* 2739 /*
2717 * we can't wait on the range with the transaction 2740 * we can't wait on the range with the transaction
2718 * running or with the extent lock held 2741 * running or with the extent lock held
@@ -2794,7 +2817,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2794 if (IS_ERR(trans)) { 2817 if (IS_ERR(trans)) {
2795 ret = PTR_ERR(trans); 2818 ret = PTR_ERR(trans);
2796 } else { 2819 } else {
2797 inode->i_ctime = CURRENT_TIME; 2820 inode->i_ctime = current_fs_time(inode->i_sb);
2798 i_size_write(inode, actual_end); 2821 i_size_write(inode, actual_end);
2799 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2822 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2800 ret = btrfs_update_inode(trans, root, inode); 2823 ret = btrfs_update_inode(trans, root, inode);
@@ -2806,7 +2829,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2806 } 2829 }
2807out_unlock: 2830out_unlock:
2808 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2831 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2809 &cached_state, GFP_NOFS); 2832 &cached_state, GFP_KERNEL);
2810out: 2833out:
2811 /* 2834 /*
2812 * As we waited the extent range, the data_rsv_map must be empty 2835 * As we waited the extent range, the data_rsv_map must be empty
@@ -2939,8 +2962,7 @@ const struct file_operations btrfs_file_operations = {
2939 2962
2940void btrfs_auto_defrag_exit(void) 2963void btrfs_auto_defrag_exit(void)
2941{ 2964{
2942 if (btrfs_inode_defrag_cachep) 2965 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2943 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2944} 2966}
2945 2967
2946int btrfs_auto_defrag_init(void) 2968int btrfs_auto_defrag_init(void)
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index e50316c4af15..1f0ec19b23f6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -556,6 +556,9 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
556 mutex_lock(&root->objectid_mutex); 556 mutex_lock(&root->objectid_mutex);
557 557
558 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { 558 if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
559 btrfs_warn(root->fs_info,
560 "the objectid of root %llu reaches its highest value",
561 root->root_key.objectid);
559 ret = -ENOSPC; 562 ret = -ENOSPC;
560 goto out; 563 goto out;
561 } 564 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d96f5cf38a2d..41a5688ffdfe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -263,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
263 data_len = compressed_size; 263 data_len = compressed_size;
264 264
265 if (start > 0 || 265 if (start > 0 ||
266 actual_end > PAGE_CACHE_SIZE || 266 actual_end > root->sectorsize ||
267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || 267 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
268 (!compressed_size && 268 (!compressed_size &&
269 (actual_end & (root->sectorsize - 1)) == 0) || 269 (actual_end & (root->sectorsize - 1)) == 0) ||
@@ -2002,7 +2002,8 @@ again:
2002 if (PagePrivate2(page)) 2002 if (PagePrivate2(page))
2003 goto out; 2003 goto out;
2004 2004
2005 ordered = btrfs_lookup_ordered_extent(inode, page_start); 2005 ordered = btrfs_lookup_ordered_range(inode, page_start,
2006 PAGE_CACHE_SIZE);
2006 if (ordered) { 2007 if (ordered) {
2007 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2008 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2008 page_end, &cached_state, GFP_NOFS); 2009 page_end, &cached_state, GFP_NOFS);
@@ -4013,7 +4014,8 @@ err:
4013 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4014 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4014 inode_inc_iversion(inode); 4015 inode_inc_iversion(inode);
4015 inode_inc_iversion(dir); 4016 inode_inc_iversion(dir);
4016 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4017 inode->i_ctime = dir->i_mtime =
4018 dir->i_ctime = current_fs_time(inode->i_sb);
4017 ret = btrfs_update_inode(trans, root, dir); 4019 ret = btrfs_update_inode(trans, root, dir);
4018out: 4020out:
4019 return ret; 4021 return ret;
@@ -4156,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4156 4158
4157 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4159 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4158 inode_inc_iversion(dir); 4160 inode_inc_iversion(dir);
4159 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4161 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
4160 ret = btrfs_update_inode_fallback(trans, root, dir); 4162 ret = btrfs_update_inode_fallback(trans, root, dir);
4161 if (ret) 4163 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4164 btrfs_abort_transaction(trans, root, ret);
@@ -4211,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
4211{ 4213{
4212 int ret; 4214 int ret;
4213 4215
4216 /*
4217 * This is only used to apply pressure to the enospc system, we don't
4218 * intend to use this reservation at all.
4219 */
4214 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); 4220 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4221 bytes_deleted *= root->nodesize;
4215 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, 4222 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4216 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4223 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4217 if (!ret) 4224 if (!ret) {
4225 trace_btrfs_space_reservation(root->fs_info, "transaction",
4226 trans->transid,
4227 bytes_deleted, 1);
4218 trans->bytes_reserved += bytes_deleted; 4228 trans->bytes_reserved += bytes_deleted;
4229 }
4219 return ret; 4230 return ret;
4220 4231
4221} 4232}
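The added multiply converts a leaf estimate into bytes: btrfs_csum_bytes_to_leaves() guesses how many leaves the checksum deletions may touch, and each leaf costs one node-sized block of metadata reservation. A rough model of that arithmetic (both constants are illustrative, not btrfs's actual item geometry):

    #include <stdint.h>

    #define NODESIZE       16384ULL   /* a common btrfs default */
    #define CSUMS_PER_LEAF 2048ULL    /* illustrative capacity, not exact */

    /* Bytes to reserve so the csum deletions for bytes_deleted of data
     * cannot abort the transaction; one node-sized block per leaf. */
    static uint64_t truncate_reserve(uint64_t bytes_deleted)
    {
            uint64_t csums  = bytes_deleted / 4096;   /* one csum per sector */
            uint64_t leaves = (csums + CSUMS_PER_LEAF - 1) / CSUMS_PER_LEAF;

            return leaves * NODESIZE;   /* the hunk's "*= root->nodesize" */
    }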
@@ -4248,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode,
4248 * read the extent item from disk (data not in the page cache). 4259 * read the extent item from disk (data not in the page cache).
4249 */ 4260 */
4250 btrfs_release_path(path); 4261 btrfs_release_path(path);
4251 return btrfs_truncate_page(inode, offset, page_end - offset, 0); 4262 return btrfs_truncate_block(inode, offset, page_end - offset,
4263 0);
4252 } 4264 }
4253 4265
4254 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4266 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
@@ -4601,17 +4613,17 @@ error:
4601} 4613}
4602 4614
4603/* 4615/*
4604 * btrfs_truncate_page - read, zero a chunk and write a page 4616 * btrfs_truncate_block - read, zero a chunk and write a block
4605 * @inode - inode that we're zeroing 4617 * @inode - inode that we're zeroing
4606 * @from - the offset to start zeroing 4618 * @from - the offset to start zeroing
4607 * @len - the length to zero, 0 to zero the entire range respective to the 4619 * @len - the length to zero, 0 to zero the entire range respective to the
4608 * offset 4620 * offset
4609 * @front - zero up to the offset instead of from the offset on 4621 * @front - zero up to the offset instead of from the offset on
4610 * 4622 *
4611 * This will find the page for the "from" offset and cow the page and zero the 4623 * This will find the block for the "from" offset and cow the block and zero the
4612 * part we want to zero. This is used with truncate and hole punching. 4624 * part we want to zero. This is used with truncate and hole punching.
4613 */ 4625 */
4614int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4626int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4615 int front) 4627 int front)
4616{ 4628{
4617 struct address_space *mapping = inode->i_mapping; 4629 struct address_space *mapping = inode->i_mapping;
@@ -4622,18 +4634,19 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4622 char *kaddr; 4634 char *kaddr;
4623 u32 blocksize = root->sectorsize; 4635 u32 blocksize = root->sectorsize;
4624 pgoff_t index = from >> PAGE_CACHE_SHIFT; 4636 pgoff_t index = from >> PAGE_CACHE_SHIFT;
4625 unsigned offset = from & (PAGE_CACHE_SIZE-1); 4637 unsigned offset = from & (blocksize - 1);
4626 struct page *page; 4638 struct page *page;
4627 gfp_t mask = btrfs_alloc_write_mask(mapping); 4639 gfp_t mask = btrfs_alloc_write_mask(mapping);
4628 int ret = 0; 4640 int ret = 0;
4629 u64 page_start; 4641 u64 block_start;
4630 u64 page_end; 4642 u64 block_end;
4631 4643
4632 if ((offset & (blocksize - 1)) == 0 && 4644 if ((offset & (blocksize - 1)) == 0 &&
4633 (!len || ((len & (blocksize - 1)) == 0))) 4645 (!len || ((len & (blocksize - 1)) == 0)))
4634 goto out; 4646 goto out;
4647
4635 ret = btrfs_delalloc_reserve_space(inode, 4648 ret = btrfs_delalloc_reserve_space(inode,
4636 round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); 4649 round_down(from, blocksize), blocksize);
4637 if (ret) 4650 if (ret)
4638 goto out; 4651 goto out;
4639 4652
@@ -4641,14 +4654,14 @@ again:
4641 page = find_or_create_page(mapping, index, mask); 4654 page = find_or_create_page(mapping, index, mask);
4642 if (!page) { 4655 if (!page) {
4643 btrfs_delalloc_release_space(inode, 4656 btrfs_delalloc_release_space(inode,
4644 round_down(from, PAGE_CACHE_SIZE), 4657 round_down(from, blocksize),
4645 PAGE_CACHE_SIZE); 4658 blocksize);
4646 ret = -ENOMEM; 4659 ret = -ENOMEM;
4647 goto out; 4660 goto out;
4648 } 4661 }
4649 4662
4650 page_start = page_offset(page); 4663 block_start = round_down(from, blocksize);
4651 page_end = page_start + PAGE_CACHE_SIZE - 1; 4664 block_end = block_start + blocksize - 1;
4652 4665
4653 if (!PageUptodate(page)) { 4666 if (!PageUptodate(page)) {
4654 ret = btrfs_readpage(NULL, page); 4667 ret = btrfs_readpage(NULL, page);
@@ -4665,12 +4678,12 @@ again:
4665 } 4678 }
4666 wait_on_page_writeback(page); 4679 wait_on_page_writeback(page);
4667 4680
4668 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 4681 lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4669 set_page_extent_mapped(page); 4682 set_page_extent_mapped(page);
4670 4683
4671 ordered = btrfs_lookup_ordered_extent(inode, page_start); 4684 ordered = btrfs_lookup_ordered_extent(inode, block_start);
4672 if (ordered) { 4685 if (ordered) {
4673 unlock_extent_cached(io_tree, page_start, page_end, 4686 unlock_extent_cached(io_tree, block_start, block_end,
4674 &cached_state, GFP_NOFS); 4687 &cached_state, GFP_NOFS);
4675 unlock_page(page); 4688 unlock_page(page);
4676 page_cache_release(page); 4689 page_cache_release(page);
@@ -4679,39 +4692,41 @@ again:
4679 goto again; 4692 goto again;
4680 } 4693 }
4681 4694
4682 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 4695 clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4683 EXTENT_DIRTY | EXTENT_DELALLOC | 4696 EXTENT_DIRTY | EXTENT_DELALLOC |
4684 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4697 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4685 0, 0, &cached_state, GFP_NOFS); 4698 0, 0, &cached_state, GFP_NOFS);
4686 4699
4687 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 4700 ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4688 &cached_state); 4701 &cached_state);
4689 if (ret) { 4702 if (ret) {
4690 unlock_extent_cached(io_tree, page_start, page_end, 4703 unlock_extent_cached(io_tree, block_start, block_end,
4691 &cached_state, GFP_NOFS); 4704 &cached_state, GFP_NOFS);
4692 goto out_unlock; 4705 goto out_unlock;
4693 } 4706 }
4694 4707
4695 if (offset != PAGE_CACHE_SIZE) { 4708 if (offset != blocksize) {
4696 if (!len) 4709 if (!len)
4697 len = PAGE_CACHE_SIZE - offset; 4710 len = blocksize - offset;
4698 kaddr = kmap(page); 4711 kaddr = kmap(page);
4699 if (front) 4712 if (front)
4700 memset(kaddr, 0, offset); 4713 memset(kaddr + (block_start - page_offset(page)),
4714 0, offset);
4701 else 4715 else
4702 memset(kaddr + offset, 0, len); 4716 memset(kaddr + (block_start - page_offset(page)) + offset,
4717 0, len);
4703 flush_dcache_page(page); 4718 flush_dcache_page(page);
4704 kunmap(page); 4719 kunmap(page);
4705 } 4720 }
4706 ClearPageChecked(page); 4721 ClearPageChecked(page);
4707 set_page_dirty(page); 4722 set_page_dirty(page);
4708 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 4723 unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4709 GFP_NOFS); 4724 GFP_NOFS);
4710 4725
4711out_unlock: 4726out_unlock:
4712 if (ret) 4727 if (ret)
4713 btrfs_delalloc_release_space(inode, page_start, 4728 btrfs_delalloc_release_space(inode, block_start,
4714 PAGE_CACHE_SIZE); 4729 blocksize);
4715 unlock_page(page); 4730 unlock_page(page);
4716 page_cache_release(page); 4731 page_cache_release(page);
4717out: 4732out:
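With sub-page blocks the memset above can no longer assume the block begins at the page base; it must land on the block's offset within the mapped page. A sketch of the offset math, assuming a 64K page over 4K blocks, both powers of two:

    #include <stdint.h>
    #include <string.h>

    #define PAGE_SIZE_ 65536ULL   /* assumed 64K page (e.g. ppc64) */
    #define BLOCKSIZE  4096ULL

    static void zero_partial_block(uint8_t *kaddr, uint64_t from, int front,
                                   uint64_t len)
    {
            uint64_t page_off    = from & ~(PAGE_SIZE_ - 1);  /* page_offset() */
            uint64_t block_start = from & ~(BLOCKSIZE - 1);
            uint64_t offset      = from & (BLOCKSIZE - 1);
            uint8_t *blk         = kaddr + (block_start - page_off);

            if (!len)
                    len = BLOCKSIZE - offset;
            if (front)
                    memset(blk, 0, offset);        /* zero up to 'from'   */
            else
                    memset(blk + offset, 0, len);  /* zero from 'from' on */
    }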
@@ -4782,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4782 int err = 0; 4797 int err = 0;
4783 4798
4784 /* 4799 /*
4785 * If our size started in the middle of a page we need to zero out the 4800 * If our size started in the middle of a block we need to zero out the
4786 * rest of the page before we expand the i_size, otherwise we could 4801 * rest of the block before we expand the i_size, otherwise we could
4787 * expose stale data. 4802 * expose stale data.
4788 */ 4803 */
4789 err = btrfs_truncate_page(inode, oldsize, 0, 0); 4804 err = btrfs_truncate_block(inode, oldsize, 0, 0);
4790 if (err) 4805 if (err)
4791 return err; 4806 return err;
4792 4807
@@ -4895,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4895 } 4910 }
4896 4911
4897 if (newsize > oldsize) { 4912 if (newsize > oldsize) {
4898 truncate_pagecache(inode, newsize);
4899 /* 4913 /*
4900 * Don't do an expanding truncate while snapshoting is ongoing. 4914 * Don't do an expanding truncate while snapshoting is ongoing.
4901 * This is to ensure the snapshot captures a fully consistent 4915 * This is to ensure the snapshot captures a fully consistent
@@ -4918,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4918 4932
4919 i_size_write(inode, newsize); 4933 i_size_write(inode, newsize);
4920 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4934 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4935 pagecache_isize_extended(inode, oldsize, newsize);
4921 ret = btrfs_update_inode(trans, root, inode); 4936 ret = btrfs_update_inode(trans, root, inode);
4922 btrfs_end_write_no_snapshoting(root); 4937 btrfs_end_write_no_snapshoting(root);
4923 btrfs_end_transaction(trans, root); 4938 btrfs_end_transaction(trans, root);
@@ -5588,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s,
5588 inode->i_op = &btrfs_dir_ro_inode_operations; 5603 inode->i_op = &btrfs_dir_ro_inode_operations;
5589 inode->i_fop = &simple_dir_operations; 5604 inode->i_fop = &simple_dir_operations;
5590 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5605 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5591 inode->i_mtime = CURRENT_TIME; 5606 inode->i_mtime = current_fs_time(inode->i_sb);
5592 inode->i_atime = inode->i_mtime; 5607 inode->i_atime = inode->i_mtime;
5593 inode->i_ctime = inode->i_mtime; 5608 inode->i_ctime = inode->i_mtime;
5594 BTRFS_I(inode)->i_otime = inode->i_mtime; 5609 BTRFS_I(inode)->i_otime = inode->i_mtime;
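The CURRENT_TIME conversions in this commit matter because current_fs_time(sb) truncates the wall clock to the superblock's declared timestamp granularity, keeping in-core and on-disk times identical. A sketch modelled on the kernel's timespec_trunc(), which that helper applies:

    #include <time.h>

    /* Truncate a wall-clock timestamp to a filesystem's granularity in
     * nanoseconds: 1 keeps nanoseconds, 1000000000 keeps whole seconds. */
    static struct timespec fs_time_trunc(struct timespec ts, long gran)
    {
            if (gran == 1000000000L)
                    ts.tv_nsec = 0;
            else if (gran > 1)
                    ts.tv_nsec -= ts.tv_nsec % gran;
            return ts;
    }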
@@ -5790,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5790 if (name_len <= sizeof(tmp_name)) { 5805 if (name_len <= sizeof(tmp_name)) {
5791 name_ptr = tmp_name; 5806 name_ptr = tmp_name;
5792 } else { 5807 } else {
5793 name_ptr = kmalloc(name_len, GFP_NOFS); 5808 name_ptr = kmalloc(name_len, GFP_KERNEL);
5794 if (!name_ptr) { 5809 if (!name_ptr) {
5795 ret = -ENOMEM; 5810 ret = -ENOMEM;
5796 goto err; 5811 goto err;
@@ -6172,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6172 inode_init_owner(inode, dir, mode); 6187 inode_init_owner(inode, dir, mode);
6173 inode_set_bytes(inode, 0); 6188 inode_set_bytes(inode, 0);
6174 6189
6175 inode->i_mtime = CURRENT_TIME; 6190 inode->i_mtime = current_fs_time(inode->i_sb);
6176 inode->i_atime = inode->i_mtime; 6191 inode->i_atime = inode->i_mtime;
6177 inode->i_ctime = inode->i_mtime; 6192 inode->i_ctime = inode->i_mtime;
6178 BTRFS_I(inode)->i_otime = inode->i_mtime; 6193 BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -6285,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
6285 btrfs_i_size_write(parent_inode, parent_inode->i_size + 6300 btrfs_i_size_write(parent_inode, parent_inode->i_size +
6286 name_len * 2); 6301 name_len * 2);
6287 inode_inc_iversion(parent_inode); 6302 inode_inc_iversion(parent_inode);
6288 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 6303 parent_inode->i_mtime = parent_inode->i_ctime =
6304 current_fs_time(parent_inode->i_sb);
6289 ret = btrfs_update_inode(trans, root, parent_inode); 6305 ret = btrfs_update_inode(trans, root, parent_inode);
6290 if (ret) 6306 if (ret)
6291 btrfs_abort_transaction(trans, root, ret); 6307 btrfs_abort_transaction(trans, root, ret);
@@ -6503,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6503 BTRFS_I(inode)->dir_index = 0ULL; 6519 BTRFS_I(inode)->dir_index = 0ULL;
6504 inc_nlink(inode); 6520 inc_nlink(inode);
6505 inode_inc_iversion(inode); 6521 inode_inc_iversion(inode);
6506 inode->i_ctime = CURRENT_TIME; 6522 inode->i_ctime = current_fs_time(inode->i_sb);
6507 ihold(inode); 6523 ihold(inode);
6508 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6524 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6509 6525
@@ -7414,7 +7430,26 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7414 cached_state, GFP_NOFS); 7430 cached_state, GFP_NOFS);
7415 7431
7416 if (ordered) { 7432 if (ordered) {
7417 btrfs_start_ordered_extent(inode, ordered, 1); 7433 /*
7434 * If we are doing a DIO read and the ordered extent we
7435 * found is for a buffered write, we can not wait for it
7436 * to complete and retry, because if we do so we can
7437 * deadlock with concurrent buffered writes on page
7438 * locks. This happens only if our DIO read covers more
7439 * than one extent map, if at this point has already
7440 * created an ordered extent for a previous extent map
7441 * and locked its range in the inode's io tree, and a
7442 * concurrent write against that previous extent map's
7443 * range and this range started (we unlock the ranges
7444 * in the io tree only when the bios complete and
7445 * buffered writes always lock pages before attempting
7446 * to lock range in the io tree).
7447 */
7448 if (writing ||
7449 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7450 btrfs_start_ordered_extent(inode, ordered, 1);
7451 else
7452 ret = -ENOTBLK;
7418 btrfs_put_ordered_extent(ordered); 7453 btrfs_put_ordered_extent(ordered);
7419 } else { 7454 } else {
7420 /* 7455 /*
@@ -7431,9 +7466,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7431 * that page. 7466 * that page.
7432 */ 7467 */
7433 ret = -ENOTBLK; 7468 ret = -ENOTBLK;
7434 break;
7435 } 7469 }
7436 7470
7471 if (ret)
7472 break;
7473
7437 cond_resched(); 7474 cond_resched();
7438 } 7475 }
7439 7476
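The long comment above explains why a DIO read must not wait on ordered extents created by buffered writers. The decision itself reduces to a small predicate (names are illustrative):

    #include <stdbool.h>

    /*
     * Wait on the ordered extent only when that cannot deadlock: either
     * we are the writer ourselves, or the extent was created by direct
     * I/O (BTRFS_ORDERED_DIRECT) and so holds no page locks we may need.
     * Otherwise return -ENOTBLK and fall back to buffered reading.
     */
    static bool can_wait_on_ordered(bool writing, bool ordered_is_direct)
    {
            return writing || ordered_is_direct;
    }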
@@ -7764,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode,
7764} 7801}
7765 7802
7766static int dio_read_error(struct inode *inode, struct bio *failed_bio, 7803static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7767 struct page *page, u64 start, u64 end, 7804 struct page *page, unsigned int pgoff,
7768 int failed_mirror, bio_end_io_t *repair_endio, 7805 u64 start, u64 end, int failed_mirror,
7769 void *repair_arg) 7806 bio_end_io_t *repair_endio, void *repair_arg)
7770{ 7807{
7771 struct io_failure_record *failrec; 7808 struct io_failure_record *failrec;
7772 struct bio *bio; 7809 struct bio *bio;
@@ -7787,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7787 return -EIO; 7824 return -EIO;
7788 } 7825 }
7789 7826
7790 if (failed_bio->bi_vcnt > 1) 7827 if ((failed_bio->bi_vcnt > 1)
7828 || (failed_bio->bi_io_vec->bv_len
7829 > BTRFS_I(inode)->root->sectorsize))
7791 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 7830 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7792 else 7831 else
7793 read_mode = READ_SYNC; 7832 read_mode = READ_SYNC;
@@ -7795,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7795 isector = start - btrfs_io_bio(failed_bio)->logical; 7834 isector = start - btrfs_io_bio(failed_bio)->logical;
7796 isector >>= inode->i_sb->s_blocksize_bits; 7835 isector >>= inode->i_sb->s_blocksize_bits;
7797 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7836 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7798 0, isector, repair_endio, repair_arg); 7837 pgoff, isector, repair_endio, repair_arg);
7799 if (!bio) { 7838 if (!bio) {
7800 free_io_failure(inode, failrec); 7839 free_io_failure(inode, failrec);
7801 return -EIO; 7840 return -EIO;
@@ -7825,12 +7864,17 @@ struct btrfs_retry_complete {
7825static void btrfs_retry_endio_nocsum(struct bio *bio) 7864static void btrfs_retry_endio_nocsum(struct bio *bio)
7826{ 7865{
7827 struct btrfs_retry_complete *done = bio->bi_private; 7866 struct btrfs_retry_complete *done = bio->bi_private;
7867 struct inode *inode;
7828 struct bio_vec *bvec; 7868 struct bio_vec *bvec;
7829 int i; 7869 int i;
7830 7870
7831 if (bio->bi_error) 7871 if (bio->bi_error)
7832 goto end; 7872 goto end;
7833 7873
7874 ASSERT(bio->bi_vcnt == 1);
7875 inode = bio->bi_io_vec->bv_page->mapping->host;
7876 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
7877
7834 done->uptodate = 1; 7878 done->uptodate = 1;
7835 bio_for_each_segment_all(bvec, bio, i) 7879 bio_for_each_segment_all(bvec, bio, i)
7836 clean_io_failure(done->inode, done->start, bvec->bv_page, 0); 7880 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
@@ -7842,25 +7886,35 @@ end:
7842static int __btrfs_correct_data_nocsum(struct inode *inode, 7886static int __btrfs_correct_data_nocsum(struct inode *inode,
7843 struct btrfs_io_bio *io_bio) 7887 struct btrfs_io_bio *io_bio)
7844{ 7888{
7889 struct btrfs_fs_info *fs_info;
7845 struct bio_vec *bvec; 7890 struct bio_vec *bvec;
7846 struct btrfs_retry_complete done; 7891 struct btrfs_retry_complete done;
7847 u64 start; 7892 u64 start;
7893 unsigned int pgoff;
7894 u32 sectorsize;
7895 int nr_sectors;
7848 int i; 7896 int i;
7849 int ret; 7897 int ret;
7850 7898
7899 fs_info = BTRFS_I(inode)->root->fs_info;
7900 sectorsize = BTRFS_I(inode)->root->sectorsize;
7901
7851 start = io_bio->logical; 7902 start = io_bio->logical;
7852 done.inode = inode; 7903 done.inode = inode;
7853 7904
7854 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7905 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7855try_again: 7906 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
7907 pgoff = bvec->bv_offset;
7908
7909next_block_or_try_again:
7856 done.uptodate = 0; 7910 done.uptodate = 0;
7857 done.start = start; 7911 done.start = start;
7858 init_completion(&done.done); 7912 init_completion(&done.done);
7859 7913
7860 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7914 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
7861 start + bvec->bv_len - 1, 7915 pgoff, start, start + sectorsize - 1,
7862 io_bio->mirror_num, 7916 io_bio->mirror_num,
7863 btrfs_retry_endio_nocsum, &done); 7917 btrfs_retry_endio_nocsum, &done);
7864 if (ret) 7918 if (ret)
7865 return ret; 7919 return ret;
7866 7920
@@ -7868,10 +7922,15 @@ try_again:
7868 7922
7869 if (!done.uptodate) { 7923 if (!done.uptodate) {
7870 /* We might have another mirror, so try again */ 7924 /* We might have another mirror, so try again */
7871 goto try_again; 7925 goto next_block_or_try_again;
7872 } 7926 }
7873 7927
7874 start += bvec->bv_len; 7928 start += sectorsize;
7929
7930 if (nr_sectors--) {
7931 pgoff += sectorsize;
7932 goto next_block_or_try_again;
7933 }
7875 } 7934 }
7876 7935
7877 return 0; 7936 return 0;
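The rewritten loop retries reads one sector at a time instead of one bio_vec at a time, so a single bad 4K block inside a larger page can be repaired from another mirror on its own. A skeleton of the per-sector walk, normalized from the hunk's goto form (mirror retries and completion waits elided):

    #include <stdint.h>

    #define SECTORSIZE 4096u   /* assumption: root->sectorsize */

    struct bvec { uint32_t bv_len; uint32_t bv_offset; };

    /* Visit every sector of one bio_vec, as the repair loop now does;
     * each visit would issue dio_read_error() against another mirror. */
    static void for_each_sector(const struct bvec *bvec, uint64_t *start,
                                void (*repair)(uint64_t start, uint32_t pgoff))
    {
            uint32_t nr_sectors = bvec->bv_len / SECTORSIZE;
            uint32_t pgoff = bvec->bv_offset;

            while (nr_sectors--) {
                    repair(*start, pgoff);
                    *start += SECTORSIZE;
                    pgoff += SECTORSIZE;
            }
    }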
@@ -7881,7 +7940,9 @@ static void btrfs_retry_endio(struct bio *bio)
7881{ 7940{
7882 struct btrfs_retry_complete *done = bio->bi_private; 7941 struct btrfs_retry_complete *done = bio->bi_private;
7883 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7942 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7943 struct inode *inode;
7884 struct bio_vec *bvec; 7944 struct bio_vec *bvec;
7945 u64 start;
7885 int uptodate; 7946 int uptodate;
7886 int ret; 7947 int ret;
7887 int i; 7948 int i;
@@ -7890,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio)
7890 goto end; 7951 goto end;
7891 7952
7892 uptodate = 1; 7953 uptodate = 1;
7954
7955 start = done->start;
7956
7957 ASSERT(bio->bi_vcnt == 1);
7958 inode = bio->bi_io_vec->bv_page->mapping->host;
7959 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
7960
7893 bio_for_each_segment_all(bvec, bio, i) { 7961 bio_for_each_segment_all(bvec, bio, i) {
7894 ret = __readpage_endio_check(done->inode, io_bio, i, 7962 ret = __readpage_endio_check(done->inode, io_bio, i,
7895 bvec->bv_page, 0, 7963 bvec->bv_page, bvec->bv_offset,
7896 done->start, bvec->bv_len); 7964 done->start, bvec->bv_len);
7897 if (!ret) 7965 if (!ret)
7898 clean_io_failure(done->inode, done->start, 7966 clean_io_failure(done->inode, done->start,
7899 bvec->bv_page, 0); 7967 bvec->bv_page, bvec->bv_offset);
7900 else 7968 else
7901 uptodate = 0; 7969 uptodate = 0;
7902 } 7970 }
@@ -7910,20 +7978,34 @@ end:
7910static int __btrfs_subio_endio_read(struct inode *inode, 7978static int __btrfs_subio_endio_read(struct inode *inode,
7911 struct btrfs_io_bio *io_bio, int err) 7979 struct btrfs_io_bio *io_bio, int err)
7912{ 7980{
7981 struct btrfs_fs_info *fs_info;
7913 struct bio_vec *bvec; 7982 struct bio_vec *bvec;
7914 struct btrfs_retry_complete done; 7983 struct btrfs_retry_complete done;
7915 u64 start; 7984 u64 start;
7916 u64 offset = 0; 7985 u64 offset = 0;
7986 u32 sectorsize;
7987 int nr_sectors;
7988 unsigned int pgoff;
7989 int csum_pos;
7917 int i; 7990 int i;
7918 int ret; 7991 int ret;
7919 7992
7993 fs_info = BTRFS_I(inode)->root->fs_info;
7994 sectorsize = BTRFS_I(inode)->root->sectorsize;
7995
7920 err = 0; 7996 err = 0;
7921 start = io_bio->logical; 7997 start = io_bio->logical;
7922 done.inode = inode; 7998 done.inode = inode;
7923 7999
7924 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8000 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7925 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 8001 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
7926 0, start, bvec->bv_len); 8002
8003 pgoff = bvec->bv_offset;
8004next_block:
8005 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8006 ret = __readpage_endio_check(inode, io_bio, csum_pos,
8007 bvec->bv_page, pgoff, start,
8008 sectorsize);
7927 if (likely(!ret)) 8009 if (likely(!ret))
7928 goto next; 8010 goto next;
7929try_again: 8011try_again:
@@ -7931,10 +8013,10 @@ try_again:
7931 done.start = start; 8013 done.start = start;
7932 init_completion(&done.done); 8014 init_completion(&done.done);
7933 8015
7934 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 8016 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
7935 start + bvec->bv_len - 1, 8017 pgoff, start, start + sectorsize - 1,
7936 io_bio->mirror_num, 8018 io_bio->mirror_num,
7937 btrfs_retry_endio, &done); 8019 btrfs_retry_endio, &done);
7938 if (ret) { 8020 if (ret) {
7939 err = ret; 8021 err = ret;
7940 goto next; 8022 goto next;
@@ -7947,8 +8029,15 @@ try_again:
7947 goto try_again; 8029 goto try_again;
7948 } 8030 }
7949next: 8031next:
7950 offset += bvec->bv_len; 8032 offset += sectorsize;
7951 start += bvec->bv_len; 8033 start += sectorsize;
8034
8035 ASSERT(nr_sectors);
8036
8037 if (--nr_sectors) {
8038 pgoff += sectorsize;
8039 goto next_block;
8040 }
7952 } 8041 }
7953 8042
7954 return err; 8043 return err;
@@ -8202,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8202 u64 file_offset = dip->logical_offset; 8291 u64 file_offset = dip->logical_offset;
8203 u64 submit_len = 0; 8292 u64 submit_len = 0;
8204 u64 map_length; 8293 u64 map_length;
8205 int nr_pages = 0; 8294 u32 blocksize = root->sectorsize;
8206 int ret;
8207 int async_submit = 0; 8295 int async_submit = 0;
8296 int nr_sectors;
8297 int ret;
8298 int i;
8208 8299
8209 map_length = orig_bio->bi_iter.bi_size; 8300 map_length = orig_bio->bi_iter.bi_size;
8210 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 8301 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
@@ -8234,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8234 atomic_inc(&dip->pending_bios); 8325 atomic_inc(&dip->pending_bios);
8235 8326
8236 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 8327 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
8237 if (map_length < submit_len + bvec->bv_len || 8328 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
8238 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 8329 i = 0;
8239 bvec->bv_offset) < bvec->bv_len) { 8330next_block:
8331 if (unlikely(map_length < submit_len + blocksize ||
8332 bio_add_page(bio, bvec->bv_page, blocksize,
8333 bvec->bv_offset + (i * blocksize)) < blocksize)) {
8240 /* 8334 /*
8241 * inc the count before we submit the bio so 8335 * inc the count before we submit the bio so
8242 * we know the end IO handler won't happen before 8336 * we know the end IO handler won't happen before
@@ -8257,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8257 file_offset += submit_len; 8351 file_offset += submit_len;
8258 8352
8259 submit_len = 0; 8353 submit_len = 0;
8260 nr_pages = 0;
8261 8354
8262 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 8355 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8263 start_sector, GFP_NOFS); 8356 start_sector, GFP_NOFS);
@@ -8275,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8275 bio_put(bio); 8368 bio_put(bio);
8276 goto out_err; 8369 goto out_err;
8277 } 8370 }
8371
8372 goto next_block;
8278 } else { 8373 } else {
8279 submit_len += bvec->bv_len; 8374 submit_len += blocksize;
8280 nr_pages++; 8375 if (--nr_sectors) {
8376 i++;
8377 goto next_block;
8378 }
8281 bvec++; 8379 bvec++;
8282 } 8380 }
8283 } 8381 }
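btrfs_submit_direct_hook now feeds the bio in blocksize chunks and splits it when the mapped stripe runs out mid-page. A compressed model of the per-block decision over one bio_vec (bio plumbing replaced by plain arithmetic):

    #include <stdint.h>

    #define BLOCKSIZE 4096ULL   /* assumption: root->sectorsize */

    /* How many whole blocks of this bio_vec fit before the mapped stripe
     * (map_length) runs out; the caller submits the bio there and starts
     * a new one for the rest, as the next_block loop above does. */
    static int blocks_that_fit(uint64_t map_length, uint64_t submit_len,
                               uint32_t bv_len)
    {
            int nr_sectors = (int)(bv_len / BLOCKSIZE);
            int fit = 0;

            while (fit < nr_sectors &&
                   map_length >= submit_len + (uint64_t)(fit + 1) * BLOCKSIZE)
                    fit++;
            return fit;
    }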
@@ -8642,6 +8740,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8642 struct extent_state *cached_state = NULL; 8740 struct extent_state *cached_state = NULL;
8643 u64 page_start = page_offset(page); 8741 u64 page_start = page_offset(page);
8644 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 8742 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
8743 u64 start;
8744 u64 end;
8645 int inode_evicting = inode->i_state & I_FREEING; 8745 int inode_evicting = inode->i_state & I_FREEING;
8646 8746
8647 /* 8747 /*
@@ -8661,14 +8761,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8661 8761
8662 if (!inode_evicting) 8762 if (!inode_evicting)
8663 lock_extent_bits(tree, page_start, page_end, &cached_state); 8763 lock_extent_bits(tree, page_start, page_end, &cached_state);
8664 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8764again:
8765 start = page_start;
8766 ordered = btrfs_lookup_ordered_range(inode, start,
8767 page_end - start + 1);
8665 if (ordered) { 8768 if (ordered) {
8769 end = min(page_end, ordered->file_offset + ordered->len - 1);
8666 /* 8770 /*
8667 * IO on this page will never be started, so we need 8771 * IO on this page will never be started, so we need
8668 * to account for any ordered extents now 8772 * to account for any ordered extents now
8669 */ 8773 */
8670 if (!inode_evicting) 8774 if (!inode_evicting)
8671 clear_extent_bit(tree, page_start, page_end, 8775 clear_extent_bit(tree, start, end,
8672 EXTENT_DIRTY | EXTENT_DELALLOC | 8776 EXTENT_DIRTY | EXTENT_DELALLOC |
8673 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8777 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8674 EXTENT_DEFRAG, 1, 0, &cached_state, 8778 EXTENT_DEFRAG, 1, 0, &cached_state,
@@ -8685,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8685 8789
8686 spin_lock_irq(&tree->lock); 8790 spin_lock_irq(&tree->lock);
8687 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8791 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8688 new_len = page_start - ordered->file_offset; 8792 new_len = start - ordered->file_offset;
8689 if (new_len < ordered->truncated_len) 8793 if (new_len < ordered->truncated_len)
8690 ordered->truncated_len = new_len; 8794 ordered->truncated_len = new_len;
8691 spin_unlock_irq(&tree->lock); 8795 spin_unlock_irq(&tree->lock);
8692 8796
8693 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8797 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8694 page_start, 8798 start,
8695 PAGE_CACHE_SIZE, 1)) 8799 end - start + 1, 1))
8696 btrfs_finish_ordered_io(ordered); 8800 btrfs_finish_ordered_io(ordered);
8697 } 8801 }
8698 btrfs_put_ordered_extent(ordered); 8802 btrfs_put_ordered_extent(ordered);
8699 if (!inode_evicting) { 8803 if (!inode_evicting) {
8700 cached_state = NULL; 8804 cached_state = NULL;
8701 lock_extent_bits(tree, page_start, page_end, 8805 lock_extent_bits(tree, start, end,
8702 &cached_state); 8806 &cached_state);
8703 } 8807 }
8808
8809 start = end + 1;
8810 if (start < page_end)
8811 goto again;
8704 } 8812 }
8705 8813
8706 /* 8814 /*
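Because several ordered extents may now cover one page, invalidatepage walks them in file order, clipping each to the page. A sketch of one step of that walk:

    #include <stdint.h>

    struct ordered { uint64_t off, len; };

    static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

    /* Clip one ordered extent to the page and report where the "again:"
     * loop resumes; lookups repeat until the cursor passes page_end. */
    static uint64_t clean_one_ordered(uint64_t start, uint64_t page_end,
                                      const struct ordered *o)
    {
            uint64_t end = min_u64(page_end, o->off + o->len - 1);

            /* ...clear extent bits, finish ordered IO on [start, end]... */
            return end + 1;
    }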
@@ -8761,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8761 loff_t size; 8869 loff_t size;
8762 int ret; 8870 int ret;
8763 int reserved = 0; 8871 int reserved = 0;
8872 u64 reserved_space;
8764 u64 page_start; 8873 u64 page_start;
8765 u64 page_end; 8874 u64 page_end;
8875 u64 end;
8876
8877 reserved_space = PAGE_CACHE_SIZE;
8766 8878
8767 sb_start_pagefault(inode->i_sb); 8879 sb_start_pagefault(inode->i_sb);
8768 page_start = page_offset(page); 8880 page_start = page_offset(page);
8769 page_end = page_start + PAGE_CACHE_SIZE - 1; 8881 page_end = page_start + PAGE_CACHE_SIZE - 1;
8882 end = page_end;
8770 8883
8884 /*
8885 * Reserving delalloc space after obtaining the page lock can lead to
8886 * deadlock. For example, if a dirty page is locked by this function
8887 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8888 * dirty page write out, then the btrfs_writepage() function could
8889 * end up waiting indefinitely to get a lock on the page currently
8890 * being processed by btrfs_page_mkwrite() function.
8891 */
8771 ret = btrfs_delalloc_reserve_space(inode, page_start, 8892 ret = btrfs_delalloc_reserve_space(inode, page_start,
8772 PAGE_CACHE_SIZE); 8893 reserved_space);
8773 if (!ret) { 8894 if (!ret) {
8774 ret = file_update_time(vma->vm_file); 8895 ret = file_update_time(vma->vm_file);
8775 reserved = 1; 8896 reserved = 1;
@@ -8803,7 +8924,7 @@ again:
8803 * we can't set the delalloc bits if there are pending ordered 8924 * we can't set the delalloc bits if there are pending ordered
8804 * extents. Drop our locks and wait for them to finish 8925 * extents. Drop our locks and wait for them to finish
8805 */ 8926 */
8806 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8927 ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
8807 if (ordered) { 8928 if (ordered) {
8808 unlock_extent_cached(io_tree, page_start, page_end, 8929 unlock_extent_cached(io_tree, page_start, page_end,
8809 &cached_state, GFP_NOFS); 8930 &cached_state, GFP_NOFS);
@@ -8813,6 +8934,18 @@ again:
8813 goto again; 8934 goto again;
8814 } 8935 }
8815 8936
8937 if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
8938 reserved_space = round_up(size - page_start, root->sectorsize);
8939 if (reserved_space < PAGE_CACHE_SIZE) {
8940 end = page_start + reserved_space - 1;
8941 spin_lock(&BTRFS_I(inode)->lock);
8942 BTRFS_I(inode)->outstanding_extents++;
8943 spin_unlock(&BTRFS_I(inode)->lock);
8944 btrfs_delalloc_release_space(inode, page_start,
8945 PAGE_CACHE_SIZE - reserved_space);
8946 }
8947 }
8948
8816 /* 8949 /*
8817 * XXX - page_mkwrite gets called every time the page is dirtied, even 8950 * XXX - page_mkwrite gets called every time the page is dirtied, even
8818 * if it was already dirty, so for space accounting reasons we need to 8951 * if it was already dirty, so for space accounting reasons we need to
@@ -8820,12 +8953,12 @@ again:
8820 * is probably a better way to do this, but for now keep consistent with 8953 * is probably a better way to do this, but for now keep consistent with
8821 * prepare_pages in the normal write path. 8954 * prepare_pages in the normal write path.
8822 */ 8955 */
8823 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 8956 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8824 EXTENT_DIRTY | EXTENT_DELALLOC | 8957 EXTENT_DIRTY | EXTENT_DELALLOC |
8825 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 8958 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
8826 0, 0, &cached_state, GFP_NOFS); 8959 0, 0, &cached_state, GFP_NOFS);
8827 8960
8828 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 8961 ret = btrfs_set_extent_delalloc(inode, page_start, end,
8829 &cached_state); 8962 &cached_state);
8830 if (ret) { 8963 if (ret) {
8831 unlock_extent_cached(io_tree, page_start, page_end, 8964 unlock_extent_cached(io_tree, page_start, page_end,
@@ -8864,7 +8997,7 @@ out_unlock:
8864 } 8997 }
8865 unlock_page(page); 8998 unlock_page(page);
8866out: 8999out:
8867 btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); 9000 btrfs_delalloc_release_space(inode, page_start, reserved_space);
8868out_noreserve: 9001out_noreserve:
8869 sb_end_pagefault(inode->i_sb); 9002 sb_end_pagefault(inode->i_sb);
8870 return ret; 9003 return ret;
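The reserved_space trimming earlier in this function (and the release on the out path just above) only matters on the file's last page, where blocks past the rounded-up i_size need no delalloc reservation. A sketch with an assumed 64K page over 4K blocks:

    #include <stdint.h>

    #define PAGE_SIZE_ 65536ULL   /* assumed 64K page */
    #define BLOCKSIZE  4096ULL

    static uint64_t round_up_u64(uint64_t x, uint64_t a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    /* On the file's last page only the blocks up to the rounded-up EOF
     * need delalloc reservation; the page tail past EOF is released. */
    static uint64_t mkwrite_reserved_space(uint64_t size, uint64_t page_start)
    {
            uint64_t reserved = round_up_u64(size - page_start, BLOCKSIZE);

            return reserved < PAGE_SIZE_ ? reserved : PAGE_SIZE_;
    }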
@@ -9190,16 +9323,11 @@ void btrfs_destroy_cachep(void)
9190 * destroy cache. 9323 * destroy cache.
9191 */ 9324 */
9192 rcu_barrier(); 9325 rcu_barrier();
9193 if (btrfs_inode_cachep) 9326 kmem_cache_destroy(btrfs_inode_cachep);
9194 kmem_cache_destroy(btrfs_inode_cachep); 9327 kmem_cache_destroy(btrfs_trans_handle_cachep);
9195 if (btrfs_trans_handle_cachep) 9328 kmem_cache_destroy(btrfs_transaction_cachep);
9196 kmem_cache_destroy(btrfs_trans_handle_cachep); 9329 kmem_cache_destroy(btrfs_path_cachep);
9197 if (btrfs_transaction_cachep) 9330 kmem_cache_destroy(btrfs_free_space_cachep);
9198 kmem_cache_destroy(btrfs_transaction_cachep);
9199 if (btrfs_path_cachep)
9200 kmem_cache_destroy(btrfs_path_cachep);
9201 if (btrfs_free_space_cachep)
9202 kmem_cache_destroy(btrfs_free_space_cachep);
9203} 9331}
9204 9332
9205int btrfs_init_cachep(void) 9333int btrfs_init_cachep(void)
@@ -9250,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt,
9250 9378
9251 generic_fillattr(inode, stat); 9379 generic_fillattr(inode, stat);
9252 stat->dev = BTRFS_I(inode)->root->anon_dev; 9380 stat->dev = BTRFS_I(inode)->root->anon_dev;
9253 stat->blksize = PAGE_CACHE_SIZE;
9254 9381
9255 spin_lock(&BTRFS_I(inode)->lock); 9382 spin_lock(&BTRFS_I(inode)->lock);
9256 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; 9383 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
@@ -9268,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9268 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9395 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9269 struct inode *new_inode = d_inode(new_dentry); 9396 struct inode *new_inode = d_inode(new_dentry);
9270 struct inode *old_inode = d_inode(old_dentry); 9397 struct inode *old_inode = d_inode(old_dentry);
9271 struct timespec ctime = CURRENT_TIME;
9272 u64 index = 0; 9398 u64 index = 0;
9273 u64 root_objectid; 9399 u64 root_objectid;
9274 int ret; 9400 int ret;
@@ -9365,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9365 inode_inc_iversion(old_dir); 9491 inode_inc_iversion(old_dir);
9366 inode_inc_iversion(new_dir); 9492 inode_inc_iversion(new_dir);
9367 inode_inc_iversion(old_inode); 9493 inode_inc_iversion(old_inode);
9368 old_dir->i_ctime = old_dir->i_mtime = ctime; 9494 old_dir->i_ctime = old_dir->i_mtime =
9369 new_dir->i_ctime = new_dir->i_mtime = ctime; 9495 new_dir->i_ctime = new_dir->i_mtime =
9370 old_inode->i_ctime = ctime; 9496 old_inode->i_ctime = current_fs_time(old_dir->i_sb);
9371 9497
9372 if (old_dentry->d_parent != new_dentry->d_parent) 9498 if (old_dentry->d_parent != new_dentry->d_parent)
9373 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 9499 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
@@ -9392,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9392 9518
9393 if (new_inode) { 9519 if (new_inode) {
9394 inode_inc_iversion(new_inode); 9520 inode_inc_iversion(new_inode);
9395 new_inode->i_ctime = CURRENT_TIME; 9521 new_inode->i_ctime = current_fs_time(new_inode->i_sb);
9396 if (unlikely(btrfs_ino(new_inode) == 9522 if (unlikely(btrfs_ino(new_inode) ==
9397 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9523 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9398 root_objectid = BTRFS_I(new_inode)->location.objectid; 9524 root_objectid = BTRFS_I(new_inode)->location.objectid;
@@ -9870,7 +9996,7 @@ next:
9870 *alloc_hint = ins.objectid + ins.offset; 9996 *alloc_hint = ins.objectid + ins.offset;
9871 9997
9872 inode_inc_iversion(inode); 9998 inode_inc_iversion(inode);
9873 inode->i_ctime = CURRENT_TIME; 9999 inode->i_ctime = current_fs_time(inode->i_sb);
9874 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 10000 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9875 if (!(mode & FALLOC_FL_KEEP_SIZE) && 10001 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9876 (actual_len > inode->i_size) && 10002 (actual_len > inode->i_size) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 48aee9846329..053e677839fe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,8 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61#include "qgroup.h" 61#include "qgroup.h"
62#include "tree-log.h"
63#include "compression.h"
62 64
63#ifdef CONFIG_64BIT 65#ifdef CONFIG_64BIT
64/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 66/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -347,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
347 349
348 btrfs_update_iflags(inode); 350 btrfs_update_iflags(inode);
349 inode_inc_iversion(inode); 351 inode_inc_iversion(inode);
350 inode->i_ctime = CURRENT_TIME; 352 inode->i_ctime = current_fs_time(inode->i_sb);
351 ret = btrfs_update_inode(trans, root, inode); 353 ret = btrfs_update_inode(trans, root, inode);
352 354
353 btrfs_end_transaction(trans, root); 355 btrfs_end_transaction(trans, root);
@@ -443,7 +445,7 @@ static noinline int create_subvol(struct inode *dir,
443 struct btrfs_root *root = BTRFS_I(dir)->root; 445 struct btrfs_root *root = BTRFS_I(dir)->root;
444 struct btrfs_root *new_root; 446 struct btrfs_root *new_root;
445 struct btrfs_block_rsv block_rsv; 447 struct btrfs_block_rsv block_rsv;
446 struct timespec cur_time = CURRENT_TIME; 448 struct timespec cur_time = current_fs_time(dir->i_sb);
447 struct inode *inode; 449 struct inode *inode;
448 int ret; 450 int ret;
449 int err; 451 int err;
@@ -844,10 +846,6 @@ static noinline int btrfs_mksubvol(struct path *parent,
844 if (IS_ERR(dentry)) 846 if (IS_ERR(dentry))
845 goto out_unlock; 847 goto out_unlock;
846 848
847 error = -EEXIST;
848 if (d_really_is_positive(dentry))
849 goto out_dput;
850
851 error = btrfs_may_create(dir, dentry); 849 error = btrfs_may_create(dir, dentry);
852 if (error) 850 if (error)
853 goto out_dput; 851 goto out_dput;
@@ -2097,8 +2095,6 @@ static noinline int search_ioctl(struct inode *inode,
2097 key.offset = (u64)-1; 2095 key.offset = (u64)-1;
2098 root = btrfs_read_fs_root_no_name(info, &key); 2096 root = btrfs_read_fs_root_no_name(info, &key);
2099 if (IS_ERR(root)) { 2097 if (IS_ERR(root)) {
2100 btrfs_err(info, "could not find root %llu",
2101 sk->tree_id);
2102 btrfs_free_path(path); 2098 btrfs_free_path(path);
2103 return -ENOENT; 2099 return -ENOENT;
2104 } 2100 }
@@ -2476,6 +2472,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2476 trans->block_rsv = &block_rsv; 2472 trans->block_rsv = &block_rsv;
2477 trans->bytes_reserved = block_rsv.size; 2473 trans->bytes_reserved = block_rsv.size;
2478 2474
2475 btrfs_record_snapshot_destroy(trans, dir);
2476
2479 ret = btrfs_unlink_subvol(trans, root, dir, 2477 ret = btrfs_unlink_subvol(trans, root, dir,
2480 dest->root_key.objectid, 2478 dest->root_key.objectid,
2481 dentry->d_name.name, 2479 dentry->d_name.name,
@@ -2960,8 +2958,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
2960 * of the array is bounded by len, which is in turn bounded by 2958 * of the array is bounded by len, which is in turn bounded by
2961 * BTRFS_MAX_DEDUPE_LEN. 2959 * BTRFS_MAX_DEDUPE_LEN.
2962 */ 2960 */
2963 src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2961 src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
2964 dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); 2962 dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
2965 if (!src_pgarr || !dst_pgarr) { 2963 if (!src_pgarr || !dst_pgarr) {
2966 kfree(src_pgarr); 2964 kfree(src_pgarr);
2967 kfree(dst_pgarr); 2965 kfree(dst_pgarr);
@@ -3068,6 +3066,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3068 ret = extent_same_check_offsets(src, loff, &len, olen); 3066 ret = extent_same_check_offsets(src, loff, &len, olen);
3069 if (ret) 3067 if (ret)
3070 goto out_unlock; 3068 goto out_unlock;
3069 ret = extent_same_check_offsets(src, dst_loff, &len, olen);
3070 if (ret)
3071 goto out_unlock;
3071 3072
3072 /* 3073 /*
3073 * Single inode case wants the same checks, except we 3074 * Single inode case wants the same checks, except we
@@ -3217,7 +3218,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3217 3218
3218 inode_inc_iversion(inode); 3219 inode_inc_iversion(inode);
3219 if (!no_time_update) 3220 if (!no_time_update)
3220 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3221 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
3221 /* 3222 /*
3222 * We round up to the block size at eof when determining which 3223 * We round up to the block size at eof when determining which
3223 * extents to clone above, but shouldn't round up the file size. 3224 * extents to clone above, but shouldn't round up the file size.
@@ -3889,8 +3890,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
3889 * Truncate page cache pages so that future reads will see the cloned 3890 * Truncate page cache pages so that future reads will see the cloned
3890 * data immediately and not the previous data. 3891 * data immediately and not the previous data.
3891 */ 3892 */
3892 truncate_inode_pages_range(&inode->i_data, destoff, 3893 truncate_inode_pages_range(&inode->i_data,
3893 PAGE_CACHE_ALIGN(destoff + len) - 1); 3894 round_down(destoff, PAGE_CACHE_SIZE),
3895 round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
3894out_unlock: 3896out_unlock:
3895 if (!same_inode) 3897 if (!same_inode)
3896 btrfs_double_inode_unlock(src, inode); 3898 btrfs_double_inode_unlock(src, inode);
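The rounding fix matters when destoff is not page aligned: truncating the cache from an unaligned offset leaves the head of the first page stale, so the range is widened to whole pages on both ends. A quick check of the arithmetic:

    #include <stdint.h>

    #define PAGE_SIZE_ 4096ULL

    /* Page-aligned [*lo, *hi] whose cache must be dropped so reads see
     * the cloned data; destoff=5000, len=100 yields [4096, 8191], where
     * the old unaligned start of 5000 left bytes 4096..4999 of that page
     * stale in the cache. */
    static void clone_trunc_range(uint64_t destoff, uint64_t len,
                                  uint64_t *lo, uint64_t *hi)
    {
            *lo = destoff & ~(PAGE_SIZE_ - 1);
            *hi = ((destoff + len + PAGE_SIZE_ - 1) & ~(PAGE_SIZE_ - 1)) - 1;
    }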
@@ -5031,7 +5033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
5031 struct btrfs_root *root = BTRFS_I(inode)->root; 5033 struct btrfs_root *root = BTRFS_I(inode)->root;
5032 struct btrfs_root_item *root_item = &root->root_item; 5034 struct btrfs_root_item *root_item = &root->root_item;
5033 struct btrfs_trans_handle *trans; 5035 struct btrfs_trans_handle *trans;
5034 struct timespec ct = CURRENT_TIME; 5036 struct timespec ct = current_fs_time(inode->i_sb);
5035 int ret = 0; 5037 int ret = 0;
5036 int received_uuid_changed; 5038 int received_uuid_changed;
5037 5039
@@ -5262,8 +5264,7 @@ out_unlock:
5262 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5264 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
5263 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5265 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
5264 5266
5265static int btrfs_ioctl_get_supported_features(struct file *file, 5267int btrfs_ioctl_get_supported_features(void __user *arg)
5266 void __user *arg)
5267{ 5268{
5268 static const struct btrfs_ioctl_feature_flags features[3] = { 5269 static const struct btrfs_ioctl_feature_flags features[3] = {
5269 INIT_FEATURE_FLAGS(SUPP), 5270 INIT_FEATURE_FLAGS(SUPP),
@@ -5542,7 +5543,7 @@ long btrfs_ioctl(struct file *file, unsigned int
5542 case BTRFS_IOC_SET_FSLABEL: 5543 case BTRFS_IOC_SET_FSLABEL:
5543 return btrfs_ioctl_set_fslabel(file, argp); 5544 return btrfs_ioctl_set_fslabel(file, argp);
5544 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5545 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
5545 return btrfs_ioctl_get_supported_features(file, argp); 5546 return btrfs_ioctl_get_supported_features(argp);
5546 case BTRFS_IOC_GET_FEATURES: 5547 case BTRFS_IOC_GET_FEATURES:
5547 return btrfs_ioctl_get_features(file, argp); 5548 return btrfs_ioctl_get_features(file, argp);
5548 case BTRFS_IOC_SET_FEATURES: 5549 case BTRFS_IOC_SET_FEATURES:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8c27292ea9ea..0de7da5a610d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h" 27#include "disk-io.h"
28#include "compression.h"
28 29
29static struct kmem_cache *btrfs_ordered_extent_cache; 30static struct kmem_cache *btrfs_ordered_extent_cache;
30 31
@@ -1009,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
1009 for (; node; node = rb_prev(node)) { 1010 for (; node; node = rb_prev(node)) {
1010 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 1011 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1011 1012
1012 /* We treat this entry as if it doesnt exist */ 1013 /* We treat this entry as if it doesn't exist */
1013 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) 1014 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
1014 continue; 1015 continue;
1015 if (test->file_offset + test->len <= disk_i_size) 1016 if (test->file_offset + test->len <= disk_i_size)
@@ -1114,6 +1115,5 @@ int __init ordered_data_init(void)
1114 1115
1115void ordered_data_exit(void) 1116void ordered_data_exit(void)
1116{ 1117{
1117 if (btrfs_ordered_extent_cache) 1118 kmem_cache_destroy(btrfs_ordered_extent_cache);
1118 kmem_cache_destroy(btrfs_ordered_extent_cache);
1119} 1119}
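
The dropped guard relies on kmem_cache_destroy() treating a NULL cache as a no-op, the same contract kfree() offers, so module exit paths can call it unconditionally. The convention, sketched rather than quoted from the slab code:

/*
 * Sketch of the convention, not the slab implementation itself: a NULL
 * cache returns immediately, so callers need no guard on exit paths.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
        if (!s)
                return;
        /* ... real teardown ... */
}
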
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 647ab12fdf5d..147dc6ca5de1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -295,8 +295,27 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
295 btrfs_dev_extent_chunk_offset(l, dev_extent), 295 btrfs_dev_extent_chunk_offset(l, dev_extent),
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 break; 297 break;
298 case BTRFS_DEV_STATS_KEY: 298 case BTRFS_PERSISTENT_ITEM_KEY:
299 printk(KERN_INFO "\t\tdevice stats\n"); 299 printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n",
300 key.objectid, key.offset);
301 switch (key.objectid) {
302 case BTRFS_DEV_STATS_OBJECTID:
303 printk(KERN_INFO "\t\tdevice stats\n");
304 break;
305 default:
306 printk(KERN_INFO "\t\tunknown persistent item\n");
307 }
308 break;
309 case BTRFS_TEMPORARY_ITEM_KEY:
310 printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n",
311 key.objectid, key.offset);
312 switch (key.objectid) {
313 case BTRFS_BALANCE_OBJECTID:
314 printk(KERN_INFO "\t\tbalance status\n");
315 break;
316 default:
317 printk(KERN_INFO "\t\tunknown temporary item\n");
318 }
300 break; 319 break;
301 case BTRFS_DEV_REPLACE_KEY: 320 case BTRFS_DEV_REPLACE_KEY:
302 printk(KERN_INFO "\t\tdev replace\n"); 321 printk(KERN_INFO "\t\tdev replace\n");
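
BTRFS_DEV_STATS_KEY is generalized into BTRFS_PERSISTENT_ITEM_KEY, with BTRFS_TEMPORARY_ITEM_KEY as the transient counterpart: the 8-bit key type now names a family and the objectid selects the concrete payload, so new item kinds stop consuming scarce type values. For orientation, the key is the usual triple:

/*
 * Search keys sort by (objectid, type, offset); dispatching on the
 * objectid within one reserved type keeps the small type space free
 * (declaration as in ctree.h of this era).
 */
struct btrfs_key {
        __u64 objectid;
        __u8 type;
        __u64 offset;
} __attribute__ ((__packed__));
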
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f9e60231f685..36992128c746 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -22,6 +22,7 @@
22#include "hash.h" 22#include "hash.h"
23#include "transaction.h" 23#include "transaction.h"
24#include "xattr.h" 24#include "xattr.h"
25#include "compression.h"
25 26
26#define BTRFS_PROP_HANDLERS_HT_BITS 8 27#define BTRFS_PROP_HANDLERS_HT_BITS 8
27static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); 28static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 619f92963e27..b892914968c1 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -72,7 +72,7 @@ struct reada_extent {
72 spinlock_t lock; 72 spinlock_t lock;
73 struct reada_zone *zones[BTRFS_MAX_MIRRORS]; 73 struct reada_zone *zones[BTRFS_MAX_MIRRORS];
74 int nzones; 74 int nzones;
75 struct btrfs_device *scheduled_for; 75 int scheduled;
76}; 76};
77 77
78struct reada_zone { 78struct reada_zone {
@@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info); 101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102 102
103static int reada_add_block(struct reada_control *rc, u64 logical, 103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation); 104 struct btrfs_key *top, u64 generation);
105 105
106/* recurses */ 106/* recurses */
107/* in case of err, eb might be NULL */ 107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 108static void __readahead_hook(struct btrfs_fs_info *fs_info,
109 u64 start, int err) 109 struct reada_extent *re, struct extent_buffer *eb,
110 u64 start, int err)
110{ 111{
111 int level = 0; 112 int level = 0;
112 int nritems; 113 int nritems;
113 int i; 114 int i;
114 u64 bytenr; 115 u64 bytenr;
115 u64 generation; 116 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list; 117 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121 118
122 if (eb) 119 if (eb)
123 level = btrfs_header_level(eb); 120 level = btrfs_header_level(eb);
124 121
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 re->refcnt++;
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock); 122 spin_lock(&re->lock);
136 /* 123 /*
137 * just take the full list from the extent. afterwards we 124 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore 125 * don't need the lock anymore
139 */ 126 */
140 list_replace_init(&re->extctl, &list); 127 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for; 128 re->scheduled = 0;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock); 129 spin_unlock(&re->lock);
144 130
145 if (err == 0) { 131 /*
146 nritems = level ? btrfs_header_nritems(eb) : 0; 132 * this is the error case, the extent buffer has not been
147 generation = btrfs_header_generation(eb); 133 * read correctly. We won't access anything from it and
148 /* 134 * just cleanup our data structures. Effectively this will
149 * FIXME: currently we just set nritems to 0 if this is a leaf, 135 * cut the branch below this node from read ahead.
150 * effectively ignoring the content. In a next step we could 136 */
151 * trigger more readahead depending from the content, e.g. 137 if (err)
152 * fetch the checksums for the extents in the leaf. 138 goto cleanup;
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164 139
140 /*
141 * FIXME: currently we just set nritems to 0 if this is a leaf,
142 * effectively ignoring the content. In a next step we could
143 * trigger more readahead depending from the content, e.g.
144 * fetch the checksums for the extents in the leaf.
145 */
146 if (!level)
147 goto cleanup;
148
149 nritems = btrfs_header_nritems(eb);
150 generation = btrfs_header_generation(eb);
165 for (i = 0; i < nritems; i++) { 151 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec; 152 struct reada_extctl *rec;
167 u64 n_gen; 153 u64 n_gen;
@@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
188 */ 174 */
189#ifdef DEBUG 175#ifdef DEBUG
190 if (rec->generation != generation) { 176 if (rec->generation != generation) {
191 btrfs_debug(root->fs_info, 177 btrfs_debug(fs_info,
192 "generation mismatch for (%llu,%d,%llu) %llu != %llu", 178 "generation mismatch for (%llu,%d,%llu) %llu != %llu",
193 key.objectid, key.type, key.offset, 179 key.objectid, key.type, key.offset,
194 rec->generation, generation); 180 rec->generation, generation);
195 } 181 }
196#endif 182#endif
197 if (rec->generation == generation && 183 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && 184 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) 185 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key, 186 reada_add_block(rc, bytenr, &next_key, n_gen);
201 level - 1, n_gen);
202 } 187 }
203 } 188 }
189
190cleanup:
204 /* 191 /*
205 * free extctl records 192 * free extctl records
206 */ 193 */
@@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
222 209
223 reada_extent_put(fs_info, re); /* one ref for each entry */ 210 reada_extent_put(fs_info, re); /* one ref for each entry */
224 } 211 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228 212
229 return 0; 213 return;
230} 214}
231 215
232/* 216/*
233 * start is passed separately in case eb is NULL, which may be the case with 217 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O 218 * failed I/O
235 */ 219 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 220int btree_readahead_hook(struct btrfs_fs_info *fs_info,
237 u64 start, int err) 221 struct extent_buffer *eb, u64 start, int err)
238{ 222{
239 int ret; 223 int ret = 0;
224 struct reada_extent *re;
240 225
241 ret = __readahead_hook(root, eb, start, err); 226 /* find extent */
227 spin_lock(&fs_info->reada_lock);
228 re = radix_tree_lookup(&fs_info->reada_tree,
229 start >> PAGE_CACHE_SHIFT);
230 if (re)
231 re->refcnt++;
232 spin_unlock(&fs_info->reada_lock);
233 if (!re) {
234 ret = -1;
235 goto start_machine;
236 }
242 237
243 reada_start_machine(root->fs_info); 238 __readahead_hook(fs_info, re, eb, start, err);
239 reada_extent_put(fs_info, re); /* our ref */
244 240
241start_machine:
242 reada_start_machine(fs_info);
245 return ret; 243 return ret;
246} 244}
247 245
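
After this refactor the radix tree lookup, reference pinning, and release happen exactly once, in btree_readahead_hook(); __readahead_hook() receives the pinned reada_extent, and the worker path passes the one it already holds. The lock-only-to-pin pattern, distilled:

/*
 * Pin the object under the lock, work on it unlocked, drop the
 * reference when done; the lookup now happens exactly once.
 */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, start >> PAGE_CACHE_SHIFT);
if (re)
        re->refcnt++;           /* pin while we use it */
spin_unlock(&fs_info->reada_lock);

if (re) {
        __readahead_hook(fs_info, re, eb, start, err);
        reada_extent_put(fs_info, re);  /* unpin */
}
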
@@ -260,18 +258,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
260 spin_lock(&fs_info->reada_lock); 258 spin_lock(&fs_info->reada_lock);
261 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 259 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
262 logical >> PAGE_CACHE_SHIFT, 1); 260 logical >> PAGE_CACHE_SHIFT, 1);
263 if (ret == 1) 261 if (ret == 1 && logical >= zone->start && logical <= zone->end) {
264 kref_get(&zone->refcnt); 262 kref_get(&zone->refcnt);
265 spin_unlock(&fs_info->reada_lock);
266
267 if (ret == 1) {
268 if (logical >= zone->start && logical < zone->end)
269 return zone;
270 spin_lock(&fs_info->reada_lock);
271 kref_put(&zone->refcnt, reada_zone_release);
272 spin_unlock(&fs_info->reada_lock); 263 spin_unlock(&fs_info->reada_lock);
264 return zone;
273 } 265 }
274 266
267 spin_unlock(&fs_info->reada_lock);
268
275 cache = btrfs_lookup_block_group(fs_info, logical); 269 cache = btrfs_lookup_block_group(fs_info, logical);
276 if (!cache) 270 if (!cache)
277 return NULL; 271 return NULL;
@@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
280 end = start + cache->key.offset - 1; 274 end = start + cache->key.offset - 1;
281 btrfs_put_block_group(cache); 275 btrfs_put_block_group(cache);
282 276
283 zone = kzalloc(sizeof(*zone), GFP_NOFS); 277 zone = kzalloc(sizeof(*zone), GFP_KERNEL);
284 if (!zone) 278 if (!zone)
285 return NULL; 279 return NULL;
286 280
@@ -307,8 +301,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
307 kfree(zone); 301 kfree(zone);
308 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 302 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
309 logical >> PAGE_CACHE_SHIFT, 1); 303 logical >> PAGE_CACHE_SHIFT, 1);
310 if (ret == 1) 304 if (ret == 1 && logical >= zone->start && logical <= zone->end)
311 kref_get(&zone->refcnt); 305 kref_get(&zone->refcnt);
306 else
307 zone = NULL;
312 } 308 }
313 spin_unlock(&fs_info->reada_lock); 309 spin_unlock(&fs_info->reada_lock);
314 310
@@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
317 313
318static struct reada_extent *reada_find_extent(struct btrfs_root *root, 314static struct reada_extent *reada_find_extent(struct btrfs_root *root,
319 u64 logical, 315 u64 logical,
320 struct btrfs_key *top, int level) 316 struct btrfs_key *top)
321{ 317{
322 int ret; 318 int ret;
323 struct reada_extent *re = NULL; 319 struct reada_extent *re = NULL;
@@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
330 u64 length; 326 u64 length;
331 int real_stripes; 327 int real_stripes;
332 int nzones = 0; 328 int nzones = 0;
333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 329 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing; 330 int dev_replace_is_ongoing;
331 int have_zone = 0;
336 332
337 spin_lock(&fs_info->reada_lock); 333 spin_lock(&fs_info->reada_lock);
338 re = radix_tree_lookup(&fs_info->reada_tree, index); 334 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
343 if (re) 339 if (re)
344 return re; 340 return re;
345 341
346 re = kzalloc(sizeof(*re), GFP_NOFS); 342 re = kzalloc(sizeof(*re), GFP_KERNEL);
347 if (!re) 343 if (!re)
348 return NULL; 344 return NULL;
349 345
@@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
375 struct reada_zone *zone; 371 struct reada_zone *zone;
376 372
377 dev = bbio->stripes[nzones].dev; 373 dev = bbio->stripes[nzones].dev;
374
375 /* cannot read ahead on missing device. */
376 if (!dev->bdev)
377 continue;
378
378 zone = reada_find_zone(fs_info, dev, logical, bbio); 379 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone) 380 if (!zone)
380 break; 381 continue;
381 382
382 re->zones[nzones] = zone; 383 re->zones[re->nzones++] = zone;
383 spin_lock(&zone->lock); 384 spin_lock(&zone->lock);
384 if (!zone->elems) 385 if (!zone->elems)
385 kref_get(&zone->refcnt); 386 kref_get(&zone->refcnt);
@@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
389 kref_put(&zone->refcnt, reada_zone_release); 390 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock); 391 spin_unlock(&fs_info->reada_lock);
391 } 392 }
392 re->nzones = nzones; 393 if (re->nzones == 0) {
393 if (nzones == 0) {
394 /* not a single zone found, error and out */ 394 /* not a single zone found, error and out */
395 goto error; 395 goto error;
396 } 396 }
397 397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */ 398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 btrfs_dev_replace_lock(&fs_info->dev_replace); 399 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
400 spin_lock(&fs_info->reada_lock); 400 spin_lock(&fs_info->reada_lock);
401 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 401 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
402 if (ret == -EEXIST) { 402 if (ret == -EEXIST) {
@@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
404 BUG_ON(!re_exist); 404 BUG_ON(!re_exist);
405 re_exist->refcnt++; 405 re_exist->refcnt++;
406 spin_unlock(&fs_info->reada_lock); 406 spin_unlock(&fs_info->reada_lock);
407 btrfs_dev_replace_unlock(&fs_info->dev_replace); 407 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
408 goto error; 408 goto error;
409 } 409 }
410 if (ret) { 410 if (ret) {
411 spin_unlock(&fs_info->reada_lock); 411 spin_unlock(&fs_info->reada_lock);
412 btrfs_dev_replace_unlock(&fs_info->dev_replace); 412 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
413 goto error; 413 goto error;
414 } 414 }
415 prev_dev = NULL; 415 prev_dev = NULL;
416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( 416 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
417 &fs_info->dev_replace); 417 &fs_info->dev_replace);
418 for (i = 0; i < nzones; ++i) { 418 for (nzones = 0; nzones < re->nzones; ++nzones) {
419 dev = bbio->stripes[i].dev; 419 dev = re->zones[nzones]->device;
420
420 if (dev == prev_dev) { 421 if (dev == prev_dev) {
421 /* 422 /*
422 * in case of DUP, just add the first zone. As both 423 * in case of DUP, just add the first zone. As both
@@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
427 */ 428 */
428 continue; 429 continue;
429 } 430 }
430 if (!dev->bdev) { 431 if (!dev->bdev)
431 /* 432 continue;
432 * cannot read ahead on missing device, but for RAID5/6, 433
433 * REQ_GET_READ_MIRRORS return 1. So don't skip missing
434 * device for such case.
435 */
436 if (nzones > 1)
437 continue;
438 }
439 if (dev_replace_is_ongoing && 434 if (dev_replace_is_ongoing &&
440 dev == fs_info->dev_replace.tgtdev) { 435 dev == fs_info->dev_replace.tgtdev) {
441 /* 436 /*
@@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
447 prev_dev = dev; 442 prev_dev = dev;
448 ret = radix_tree_insert(&dev->reada_extents, index, re); 443 ret = radix_tree_insert(&dev->reada_extents, index, re);
449 if (ret) { 444 if (ret) {
450 while (--i >= 0) { 445 while (--nzones >= 0) {
451 dev = bbio->stripes[i].dev; 446 dev = re->zones[nzones]->device;
452 BUG_ON(dev == NULL); 447 BUG_ON(dev == NULL);
453 /* ignore whether the entry was inserted */ 448 /* ignore whether the entry was inserted */
454 radix_tree_delete(&dev->reada_extents, index); 449 radix_tree_delete(&dev->reada_extents, index);
@@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
456 BUG_ON(fs_info == NULL); 451 BUG_ON(fs_info == NULL);
457 radix_tree_delete(&fs_info->reada_tree, index); 452 radix_tree_delete(&fs_info->reada_tree, index);
458 spin_unlock(&fs_info->reada_lock); 453 spin_unlock(&fs_info->reada_lock);
459 btrfs_dev_replace_unlock(&fs_info->dev_replace); 454 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
460 goto error; 455 goto error;
461 } 456 }
457 have_zone = 1;
462 } 458 }
463 spin_unlock(&fs_info->reada_lock); 459 spin_unlock(&fs_info->reada_lock);
464 btrfs_dev_replace_unlock(&fs_info->dev_replace); 460 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
461
462 if (!have_zone)
463 goto error;
465 464
466 btrfs_put_bbio(bbio); 465 btrfs_put_bbio(bbio);
467 return re; 466 return re;
468 467
469error: 468error:
470 while (nzones) { 469 for (nzones = 0; nzones < re->nzones; ++nzones) {
471 struct reada_zone *zone; 470 struct reada_zone *zone;
472 471
473 --nzones;
474 zone = re->zones[nzones]; 472 zone = re->zones[nzones];
475 kref_get(&zone->refcnt); 473 kref_get(&zone->refcnt);
476 spin_lock(&zone->lock); 474 spin_lock(&zone->lock);
@@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
531 kref_put(&zone->refcnt, reada_zone_release); 529 kref_put(&zone->refcnt, reada_zone_release);
532 spin_unlock(&fs_info->reada_lock); 530 spin_unlock(&fs_info->reada_lock);
533 } 531 }
534 if (re->scheduled_for)
535 atomic_dec(&re->scheduled_for->reada_in_flight);
536 532
537 kfree(re); 533 kfree(re);
538} 534}
@@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref)
556} 552}
557 553
558static int reada_add_block(struct reada_control *rc, u64 logical, 554static int reada_add_block(struct reada_control *rc, u64 logical,
559 struct btrfs_key *top, int level, u64 generation) 555 struct btrfs_key *top, u64 generation)
560{ 556{
561 struct btrfs_root *root = rc->root; 557 struct btrfs_root *root = rc->root;
562 struct reada_extent *re; 558 struct reada_extent *re;
563 struct reada_extctl *rec; 559 struct reada_extctl *rec;
564 560
565 re = reada_find_extent(root, logical, top, level); /* takes one ref */ 561 re = reada_find_extent(root, logical, top); /* takes one ref */
566 if (!re) 562 if (!re)
567 return -1; 563 return -1;
568 564
569 rec = kzalloc(sizeof(*rec), GFP_NOFS); 565 rec = kzalloc(sizeof(*rec), GFP_KERNEL);
570 if (!rec) { 566 if (!rec) {
571 reada_extent_put(root->fs_info, re); 567 reada_extent_put(root->fs_info, re);
572 return -ENOMEM; 568 return -ENOMEM;
@@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
662 u64 logical; 658 u64 logical;
663 int ret; 659 int ret;
664 int i; 660 int i;
665 int need_kick = 0;
666 661
667 spin_lock(&fs_info->reada_lock); 662 spin_lock(&fs_info->reada_lock);
668 if (dev->reada_curr_zone == NULL) { 663 if (dev->reada_curr_zone == NULL) {
@@ -679,7 +674,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
679 */ 674 */
680 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, 675 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
681 dev->reada_next >> PAGE_CACHE_SHIFT, 1); 676 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
682 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { 677 if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
683 ret = reada_pick_zone(dev); 678 ret = reada_pick_zone(dev);
684 if (!ret) { 679 if (!ret) {
685 spin_unlock(&fs_info->reada_lock); 680 spin_unlock(&fs_info->reada_lock);
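
The comparison flips from >= to > because zone->end is inclusive; reada_find_zone() sets it to start + cache->key.offset - 1, and the two lookups earlier in this file gain the matching logical <= zone->end test. With an inclusive bound, a membership helper would look like:

/*
 * zone->end is the last contained byte, not one past it (per the
 * start + size - 1 initialization), so membership uses <= and
 * "ran past the zone" uses >.
 */
static inline bool zone_contains(const struct reada_zone *zone, u64 logical)
{
        return logical >= zone->start && logical <= zone->end;
}
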
@@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
698 693
699 spin_unlock(&fs_info->reada_lock); 694 spin_unlock(&fs_info->reada_lock);
700 695
696 spin_lock(&re->lock);
697 if (re->scheduled || list_empty(&re->extctl)) {
698 spin_unlock(&re->lock);
699 reada_extent_put(fs_info, re);
700 return 0;
701 }
702 re->scheduled = 1;
703 spin_unlock(&re->lock);
704
701 /* 705 /*
702 * find mirror num 706 * find mirror num
703 */ 707 */
@@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
709 } 713 }
710 logical = re->logical; 714 logical = re->logical;
711 715
712 spin_lock(&re->lock);
713 if (re->scheduled_for == NULL) {
714 re->scheduled_for = dev;
715 need_kick = 1;
716 }
717 spin_unlock(&re->lock);
718
719 reada_extent_put(fs_info, re);
720
721 if (!need_kick)
722 return 0;
723
724 atomic_inc(&dev->reada_in_flight); 716 atomic_inc(&dev->reada_in_flight);
725 ret = reada_tree_block_flagged(fs_info->extent_root, logical, 717 ret = reada_tree_block_flagged(fs_info->extent_root, logical,
726 mirror_num, &eb); 718 mirror_num, &eb);
727 if (ret) 719 if (ret)
728 __readahead_hook(fs_info->extent_root, NULL, logical, ret); 720 __readahead_hook(fs_info, re, NULL, logical, ret);
729 else if (eb) 721 else if (eb)
730 __readahead_hook(fs_info->extent_root, eb, eb->start, ret); 722 __readahead_hook(fs_info, re, eb, eb->start, ret);
731 723
732 if (eb) 724 if (eb)
733 free_extent_buffer(eb); 725 free_extent_buffer(eb);
734 726
727 atomic_dec(&dev->reada_in_flight);
728 reada_extent_put(fs_info, re);
729
735 return 1; 730 return 1;
736 731
737} 732}
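
Scheduling is now claimed before any I/O is issued: whichever caller flips re->scheduled under re->lock owns the read, and the device's reada_in_flight counter brackets the actual submission instead of leaking when the hook never ran. The claim step on its own:

/*
 * Test-and-set under the extent's spinlock: exactly one caller gets to
 * issue the read; everyone else drops its reference and backs off.
 */
spin_lock(&re->lock);
if (re->scheduled || list_empty(&re->extctl)) {
        spin_unlock(&re->lock);
        reada_extent_put(fs_info, re);
        return 0;
}
re->scheduled = 1;
spin_unlock(&re->lock);
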
@@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work)
752 set_task_ioprio(current, BTRFS_IOPRIO_READA); 747 set_task_ioprio(current, BTRFS_IOPRIO_READA);
753 __reada_start_machine(fs_info); 748 __reada_start_machine(fs_info);
754 set_task_ioprio(current, old_ioprio); 749 set_task_ioprio(current, old_ioprio);
750
751 atomic_dec(&fs_info->reada_works_cnt);
755} 752}
756 753
757static void __reada_start_machine(struct btrfs_fs_info *fs_info) 754static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
783 * enqueue to workers to finish it. This will distribute the load to 780 * enqueue to workers to finish it. This will distribute the load to
784 * the cores. 781 * the cores.
785 */ 782 */
786 for (i = 0; i < 2; ++i) 783 for (i = 0; i < 2; ++i) {
787 reada_start_machine(fs_info); 784 reada_start_machine(fs_info);
785 if (atomic_read(&fs_info->reada_works_cnt) >
786 BTRFS_MAX_MIRRORS * 2)
787 break;
788 }
788} 789}
789 790
790static void reada_start_machine(struct btrfs_fs_info *fs_info) 791static void reada_start_machine(struct btrfs_fs_info *fs_info)
791{ 792{
792 struct reada_machine_work *rmw; 793 struct reada_machine_work *rmw;
793 794
794 rmw = kzalloc(sizeof(*rmw), GFP_NOFS); 795 rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
795 if (!rmw) { 796 if (!rmw) {
796 /* FIXME we cannot handle this properly right now */ 797 /* FIXME we cannot handle this properly right now */
797 BUG(); 798 BUG();
@@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
801 rmw->fs_info = fs_info; 802 rmw->fs_info = fs_info;
802 803
803 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
805 atomic_inc(&fs_info->reada_works_cnt);
804} 806}
805 807
806#ifdef DEBUG 808#ifdef DEBUG
@@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
848 if (ret == 0) 850 if (ret == 0)
849 break; 851 break;
850 printk(KERN_DEBUG 852 printk(KERN_DEBUG
851 " re: logical %llu size %u empty %d for %lld", 853 " re: logical %llu size %u empty %d scheduled %d",
852 re->logical, fs_info->tree_root->nodesize, 854 re->logical, fs_info->tree_root->nodesize,
853 list_empty(&re->extctl), re->scheduled_for ? 855 list_empty(&re->extctl), re->scheduled);
854 re->scheduled_for->devid : -1);
855 856
856 for (i = 0; i < re->nzones; ++i) { 857 for (i = 0; i < re->nzones; ++i) {
857 printk(KERN_CONT " zone %llu-%llu devs", 858 printk(KERN_CONT " zone %llu-%llu devs",
@@ -878,27 +879,21 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
878 index, 1); 879 index, 1);
879 if (ret == 0) 880 if (ret == 0)
880 break; 881 break;
881 if (!re->scheduled_for) { 882 if (!re->scheduled) {
882 index = (re->logical >> PAGE_CACHE_SHIFT) + 1; 883 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
883 continue; 884 continue;
884 } 885 }
885 printk(KERN_DEBUG 886 printk(KERN_DEBUG
886 "re: logical %llu size %u list empty %d for %lld", 887 "re: logical %llu size %u list empty %d scheduled %d",
887 re->logical, fs_info->tree_root->nodesize, 888 re->logical, fs_info->tree_root->nodesize,
888 list_empty(&re->extctl), 889 list_empty(&re->extctl), re->scheduled);
889 re->scheduled_for ? re->scheduled_for->devid : -1);
890 for (i = 0; i < re->nzones; ++i) { 890 for (i = 0; i < re->nzones; ++i) {
891 printk(KERN_CONT " zone %llu-%llu devs", 891 printk(KERN_CONT " zone %llu-%llu devs",
892 re->zones[i]->start, 892 re->zones[i]->start,
893 re->zones[i]->end); 893 re->zones[i]->end);
894 for (i = 0; i < re->nzones; ++i) { 894 for (j = 0; j < re->zones[i]->ndevs; ++j) {
895 printk(KERN_CONT " zone %llu-%llu devs", 895 printk(KERN_CONT " %lld",
896 re->zones[i]->start, 896 re->zones[i]->devs[j]->devid);
897 re->zones[i]->end);
898 for (j = 0; j < re->zones[i]->ndevs; ++j) {
899 printk(KERN_CONT " %lld",
900 re->zones[i]->devs[j]->devid);
901 }
902 } 897 }
903 } 898 }
904 printk(KERN_CONT "\n"); 899 printk(KERN_CONT "\n");
@@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
917 struct reada_control *rc; 912 struct reada_control *rc;
918 u64 start; 913 u64 start;
919 u64 generation; 914 u64 generation;
920 int level;
921 int ret; 915 int ret;
922 struct extent_buffer *node; 916 struct extent_buffer *node;
923 static struct btrfs_key max_key = { 917 static struct btrfs_key max_key = {
@@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
926 .offset = (u64)-1 920 .offset = (u64)-1
927 }; 921 };
928 922
929 rc = kzalloc(sizeof(*rc), GFP_NOFS); 923 rc = kzalloc(sizeof(*rc), GFP_KERNEL);
930 if (!rc) 924 if (!rc)
931 return ERR_PTR(-ENOMEM); 925 return ERR_PTR(-ENOMEM);
932 926
@@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
940 934
941 node = btrfs_root_node(root); 935 node = btrfs_root_node(root);
942 start = node->start; 936 start = node->start;
943 level = btrfs_header_level(node);
944 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
945 free_extent_buffer(node); 938 free_extent_buffer(node);
946 939
947 ret = reada_add_block(rc, start, &max_key, level, generation); 940 ret = reada_add_block(rc, start, &max_key, generation);
948 if (ret) { 941 if (ret) {
949 kfree(rc); 942 kfree(rc);
950 return ERR_PTR(ret); 943 return ERR_PTR(ret);
@@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
959int btrfs_reada_wait(void *handle) 952int btrfs_reada_wait(void *handle)
960{ 953{
961 struct reada_control *rc = handle; 954 struct reada_control *rc = handle;
955 struct btrfs_fs_info *fs_info = rc->root->fs_info;
962 956
963 while (atomic_read(&rc->elems)) { 957 while (atomic_read(&rc->elems)) {
958 if (!atomic_read(&fs_info->reada_works_cnt))
959 reada_start_machine(fs_info);
964 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 960 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
965 5 * HZ); 961 5 * HZ);
966 dump_devs(rc->root->fs_info, 962 dump_devs(rc->root->fs_info,
@@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle)
977int btrfs_reada_wait(void *handle) 973int btrfs_reada_wait(void *handle)
978{ 974{
979 struct reada_control *rc = handle; 975 struct reada_control *rc = handle;
976 struct btrfs_fs_info *fs_info = rc->root->fs_info;
980 977
981 while (atomic_read(&rc->elems)) { 978 while (atomic_read(&rc->elems)) {
982 wait_event(rc->wait, atomic_read(&rc->elems) == 0); 979 if (!atomic_read(&fs_info->reada_works_cnt))
980 reada_start_machine(fs_info);
981 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
982 (HZ + 9) / 10);
983 } 983 }
984 984
985 kref_put(&rc->refcnt, reada_control_release); 985 kref_put(&rc->refcnt, reada_control_release);
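
Both btrfs_reada_wait() variants get the same liveness fix: if the queued worker count has drained to zero while extents are still outstanding, the waiter kicks the state machine itself instead of sleeping indefinitely on rc->wait. The resulting loop shape:

/*
 * Poll with a timeout and re-kick the machine when the worker pool has
 * drained; without this a lost wakeup could stall the wait forever.
 */
while (atomic_read(&rc->elems)) {
        if (!atomic_read(&fs_info->reada_works_cnt))
                reada_start_machine(fs_info);
        wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
                           (HZ + 9) / 10);
}
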
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2c849b08a91b..9fcd6dfc3266 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -496,7 +496,7 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
496 struct btrfs_root *root) 496 struct btrfs_root *root)
497{ 497{
498 struct btrfs_root_item *item = &root->root_item; 498 struct btrfs_root_item *item = &root->root_item;
499 struct timespec ct = CURRENT_TIME; 499 struct timespec ct = current_fs_time(root->fs_info->sb);
500 500
501 spin_lock(&root->root_item_lock); 501 spin_lock(&root->root_item_lock);
502 btrfs_set_root_ctransid(item, trans->transid); 502 btrfs_set_root_ctransid(item, trans->transid);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 92bf5ee732fb..39dbdcbf4d13 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -461,7 +461,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 461 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
462 int ret; 462 int ret;
463 463
464 sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 464 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
465 if (!sctx) 465 if (!sctx)
466 goto nomem; 466 goto nomem;
467 atomic_set(&sctx->refs, 1); 467 atomic_set(&sctx->refs, 1);
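
The GFP_NOFS to GFP_KERNEL conversions throughout scrub.c (and send.c below) follow one rule: GFP_NOFS is only needed where reclaim re-entering the filesystem could deadlock on locks the caller already holds, and these allocations run from ioctl or worker context before any such lock is taken. As a rule of thumb (a sketch, not a kernel API):

/* pick the flag by lock context, not by which file the code lives in */
static inline void *ctx_zalloc(size_t size, bool holds_fs_locks)
{
        return kzalloc(size, holds_fs_locks ? GFP_NOFS : GFP_KERNEL);
}
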
@@ -472,7 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 472 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
473 struct scrub_bio *sbio; 473 struct scrub_bio *sbio;
474 474
475 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 475 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
476 if (!sbio) 476 if (!sbio)
477 goto nomem; 477 goto nomem;
478 sctx->bios[i] = sbio; 478 sctx->bios[i] = sbio;
@@ -611,7 +611,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
611 u64 flags = 0; 611 u64 flags = 0;
612 u64 ref_root; 612 u64 ref_root;
613 u32 item_size; 613 u32 item_size;
614 u8 ref_level; 614 u8 ref_level = 0;
615 int ret; 615 int ret;
616 616
617 WARN_ON(sblock->page_count < 1); 617 WARN_ON(sblock->page_count < 1);
@@ -1654,7 +1654,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1654again: 1654again:
1655 if (!wr_ctx->wr_curr_bio) { 1655 if (!wr_ctx->wr_curr_bio) {
1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1656 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1657 GFP_NOFS); 1657 GFP_KERNEL);
1658 if (!wr_ctx->wr_curr_bio) { 1658 if (!wr_ctx->wr_curr_bio) {
1659 mutex_unlock(&wr_ctx->wr_lock); 1659 mutex_unlock(&wr_ctx->wr_lock);
1660 return -ENOMEM; 1660 return -ENOMEM;
@@ -1671,7 +1671,8 @@ again:
1671 sbio->dev = wr_ctx->tgtdev; 1671 sbio->dev = wr_ctx->tgtdev;
1672 bio = sbio->bio; 1672 bio = sbio->bio;
1673 if (!bio) { 1673 if (!bio) {
1674 bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); 1674 bio = btrfs_io_bio_alloc(GFP_KERNEL,
1675 wr_ctx->pages_per_wr_bio);
1675 if (!bio) { 1676 if (!bio) {
1676 mutex_unlock(&wr_ctx->wr_lock); 1677 mutex_unlock(&wr_ctx->wr_lock);
1677 return -ENOMEM; 1678 return -ENOMEM;
@@ -2076,7 +2077,8 @@ again:
2076 sbio->dev = spage->dev; 2077 sbio->dev = spage->dev;
2077 bio = sbio->bio; 2078 bio = sbio->bio;
2078 if (!bio) { 2079 if (!bio) {
2079 bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 2080 bio = btrfs_io_bio_alloc(GFP_KERNEL,
2081 sctx->pages_per_rd_bio);
2080 if (!bio) 2082 if (!bio)
2081 return -ENOMEM; 2083 return -ENOMEM;
2082 sbio->bio = bio; 2084 sbio->bio = bio;
@@ -2241,7 +2243,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2241 struct scrub_block *sblock; 2243 struct scrub_block *sblock;
2242 int index; 2244 int index;
2243 2245
2244 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2246 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2245 if (!sblock) { 2247 if (!sblock) {
2246 spin_lock(&sctx->stat_lock); 2248 spin_lock(&sctx->stat_lock);
2247 sctx->stat.malloc_errors++; 2249 sctx->stat.malloc_errors++;
@@ -2259,7 +2261,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2259 struct scrub_page *spage; 2261 struct scrub_page *spage;
2260 u64 l = min_t(u64, len, PAGE_SIZE); 2262 u64 l = min_t(u64, len, PAGE_SIZE);
2261 2263
2262 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2264 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2263 if (!spage) { 2265 if (!spage) {
2264leave_nomem: 2266leave_nomem:
2265 spin_lock(&sctx->stat_lock); 2267 spin_lock(&sctx->stat_lock);
@@ -2286,7 +2288,7 @@ leave_nomem:
2286 spage->have_csum = 0; 2288 spage->have_csum = 0;
2287 } 2289 }
2288 sblock->page_count++; 2290 sblock->page_count++;
2289 spage->page = alloc_page(GFP_NOFS); 2291 spage->page = alloc_page(GFP_KERNEL);
2290 if (!spage->page) 2292 if (!spage->page)
2291 goto leave_nomem; 2293 goto leave_nomem;
2292 len -= l; 2294 len -= l;
@@ -2541,7 +2543,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2541 struct scrub_block *sblock; 2543 struct scrub_block *sblock;
2542 int index; 2544 int index;
2543 2545
2544 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2546 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2545 if (!sblock) { 2547 if (!sblock) {
2546 spin_lock(&sctx->stat_lock); 2548 spin_lock(&sctx->stat_lock);
2547 sctx->stat.malloc_errors++; 2549 sctx->stat.malloc_errors++;
@@ -2561,7 +2563,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2561 struct scrub_page *spage; 2563 struct scrub_page *spage;
2562 u64 l = min_t(u64, len, PAGE_SIZE); 2564 u64 l = min_t(u64, len, PAGE_SIZE);
2563 2565
2564 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2566 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2565 if (!spage) { 2567 if (!spage) {
2566leave_nomem: 2568leave_nomem:
2567 spin_lock(&sctx->stat_lock); 2569 spin_lock(&sctx->stat_lock);
@@ -2591,7 +2593,7 @@ leave_nomem:
2591 spage->have_csum = 0; 2593 spage->have_csum = 0;
2592 } 2594 }
2593 sblock->page_count++; 2595 sblock->page_count++;
2594 spage->page = alloc_page(GFP_NOFS); 2596 spage->page = alloc_page(GFP_KERNEL);
2595 if (!spage->page) 2597 if (!spage->page)
2596 goto leave_nomem; 2598 goto leave_nomem;
2597 len -= l; 2599 len -= l;
@@ -3857,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3857 return -EIO; 3859 return -EIO;
3858 } 3860 }
3859 3861
3860 btrfs_dev_replace_lock(&fs_info->dev_replace); 3862 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3861 if (dev->scrub_device || 3863 if (dev->scrub_device ||
3862 (!is_dev_replace && 3864 (!is_dev_replace &&
3863 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3865 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3864 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3866 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3865 mutex_unlock(&fs_info->scrub_lock); 3867 mutex_unlock(&fs_info->scrub_lock);
3866 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3868 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3867 return -EINPROGRESS; 3869 return -EINPROGRESS;
3868 } 3870 }
3869 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3871 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3870 3872
3871 ret = scrub_workers_get(fs_info, is_dev_replace); 3873 ret = scrub_workers_get(fs_info, is_dev_replace);
3872 if (ret) { 3874 if (ret) {
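
btrfs_dev_replace_lock() and its unlock counterpart grow an extra argument in this series (the rework itself lives in dev-replace.c, see the diffstat); judging from these call sites, 0 takes the shared side, so readers such as scrub and readahead no longer serialize against each other. Assumed usage:

/* assumed semantics: 0 = shared (read) section, 1 = exclusive (write) */
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
busy = dev->scrub_device ||
       (!is_dev_replace &&
        btrfs_dev_replace_is_ongoing(&fs_info->dev_replace));
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
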
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 63a6152be04b..19b7bf4284ee 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -34,6 +34,7 @@
34#include "disk-io.h" 34#include "disk-io.h"
35#include "btrfs_inode.h" 35#include "btrfs_inode.h"
36#include "transaction.h" 36#include "transaction.h"
37#include "compression.h"
37 38
38static int g_verbose = 0; 39static int g_verbose = 0;
39 40
@@ -304,7 +305,7 @@ static struct fs_path *fs_path_alloc(void)
304{ 305{
305 struct fs_path *p; 306 struct fs_path *p;
306 307
307 p = kmalloc(sizeof(*p), GFP_NOFS); 308 p = kmalloc(sizeof(*p), GFP_KERNEL);
308 if (!p) 309 if (!p)
309 return NULL; 310 return NULL;
310 p->reversed = 0; 311 p->reversed = 0;
@@ -363,11 +364,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
363 * First time the inline_buf does not suffice 364 * First time the inline_buf does not suffice
364 */ 365 */
365 if (p->buf == p->inline_buf) { 366 if (p->buf == p->inline_buf) {
366 tmp_buf = kmalloc(len, GFP_NOFS); 367 tmp_buf = kmalloc(len, GFP_KERNEL);
367 if (tmp_buf) 368 if (tmp_buf)
368 memcpy(tmp_buf, p->buf, old_buf_len); 369 memcpy(tmp_buf, p->buf, old_buf_len);
369 } else { 370 } else {
370 tmp_buf = krealloc(p->buf, len, GFP_NOFS); 371 tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
371 } 372 }
372 if (!tmp_buf) 373 if (!tmp_buf)
373 return -ENOMEM; 374 return -ENOMEM;
@@ -995,7 +996,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 * values are small. 996 * values are small.
996 */ 997 */
997 buf_len = PATH_MAX; 998 buf_len = PATH_MAX;
998 buf = kmalloc(buf_len, GFP_NOFS); 999 buf = kmalloc(buf_len, GFP_KERNEL);
999 if (!buf) { 1000 if (!buf) {
1000 ret = -ENOMEM; 1001 ret = -ENOMEM;
1001 goto out; 1002 goto out;
@@ -1042,7 +1043,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1042 buf = NULL; 1043 buf = NULL;
1043 } else { 1044 } else {
1044 char *tmp = krealloc(buf, buf_len, 1045 char *tmp = krealloc(buf, buf_len,
1045 GFP_NOFS | __GFP_NOWARN); 1046 GFP_KERNEL | __GFP_NOWARN);
1046 1047
1047 if (!tmp) 1048 if (!tmp)
1048 kfree(buf); 1049 kfree(buf);
@@ -1303,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1303 /* We only use this path under the commit sem */ 1304 /* We only use this path under the commit sem */
1304 tmp_path->need_commit_sem = 0; 1305 tmp_path->need_commit_sem = 0;
1305 1306
1306 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1307 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
1307 if (!backref_ctx) { 1308 if (!backref_ctx) {
1308 ret = -ENOMEM; 1309 ret = -ENOMEM;
1309 goto out; 1310 goto out;
@@ -1984,7 +1985,7 @@ static int name_cache_insert(struct send_ctx *sctx,
1984 nce_head = radix_tree_lookup(&sctx->name_cache, 1985 nce_head = radix_tree_lookup(&sctx->name_cache,
1985 (unsigned long)nce->ino); 1986 (unsigned long)nce->ino);
1986 if (!nce_head) { 1987 if (!nce_head) {
1987 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1988 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
1988 if (!nce_head) { 1989 if (!nce_head) {
1989 kfree(nce); 1990 kfree(nce);
1990 return -ENOMEM; 1991 return -ENOMEM;
@@ -2179,7 +2180,7 @@ out_cache:
2179 /* 2180 /*
2180 * Store the result of the lookup in the name cache. 2181 * Store the result of the lookup in the name cache.
2181 */ 2182 */
2182 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); 2183 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2183 if (!nce) { 2184 if (!nce) {
2184 ret = -ENOMEM; 2185 ret = -ENOMEM;
2185 goto out; 2186 goto out;
@@ -2315,7 +2316,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
2315 if (!path) 2316 if (!path)
2316 return -ENOMEM; 2317 return -ENOMEM;
2317 2318
2318 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); 2319 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2319 if (!name) { 2320 if (!name) {
2320 btrfs_free_path(path); 2321 btrfs_free_path(path);
2321 return -ENOMEM; 2322 return -ENOMEM;
@@ -2730,7 +2731,7 @@ static int __record_ref(struct list_head *head, u64 dir,
2730{ 2731{
2731 struct recorded_ref *ref; 2732 struct recorded_ref *ref;
2732 2733
2733 ref = kmalloc(sizeof(*ref), GFP_NOFS); 2734 ref = kmalloc(sizeof(*ref), GFP_KERNEL);
2734 if (!ref) 2735 if (!ref)
2735 return -ENOMEM; 2736 return -ENOMEM;
2736 2737
@@ -2755,7 +2756,7 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list)
2755{ 2756{
2756 struct recorded_ref *new; 2757 struct recorded_ref *new;
2757 2758
2758 new = kmalloc(sizeof(*ref), GFP_NOFS); 2759 new = kmalloc(sizeof(*ref), GFP_KERNEL);
2759 if (!new) 2760 if (!new)
2760 return -ENOMEM; 2761 return -ENOMEM;
2761 2762
@@ -2818,7 +2819,7 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2818 struct rb_node *parent = NULL; 2819 struct rb_node *parent = NULL;
2819 struct orphan_dir_info *entry, *odi; 2820 struct orphan_dir_info *entry, *odi;
2820 2821
2821 odi = kmalloc(sizeof(*odi), GFP_NOFS); 2822 odi = kmalloc(sizeof(*odi), GFP_KERNEL);
2822 if (!odi) 2823 if (!odi)
2823 return ERR_PTR(-ENOMEM); 2824 return ERR_PTR(-ENOMEM);
2824 odi->ino = dir_ino; 2825 odi->ino = dir_ino;
@@ -2973,7 +2974,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
2973 struct rb_node *parent = NULL; 2974 struct rb_node *parent = NULL;
2974 struct waiting_dir_move *entry, *dm; 2975 struct waiting_dir_move *entry, *dm;
2975 2976
2976 dm = kmalloc(sizeof(*dm), GFP_NOFS); 2977 dm = kmalloc(sizeof(*dm), GFP_KERNEL);
2977 if (!dm) 2978 if (!dm)
2978 return -ENOMEM; 2979 return -ENOMEM;
2979 dm->ino = ino; 2980 dm->ino = ino;
@@ -3040,7 +3041,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
3040 int exists = 0; 3041 int exists = 0;
3041 int ret; 3042 int ret;
3042 3043
3043 pm = kmalloc(sizeof(*pm), GFP_NOFS); 3044 pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3044 if (!pm) 3045 if (!pm)
3045 return -ENOMEM; 3046 return -ENOMEM;
3046 pm->parent_ino = parent_ino; 3047 pm->parent_ino = parent_ino;
@@ -4280,7 +4281,7 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
4280 strncmp(name, ctx->name, name_len) == 0) { 4281 strncmp(name, ctx->name, name_len) == 0) {
4281 ctx->found_idx = num; 4282 ctx->found_idx = num;
4282 ctx->found_data_len = data_len; 4283 ctx->found_data_len = data_len;
4283 ctx->found_data = kmemdup(data, data_len, GFP_NOFS); 4284 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4284 if (!ctx->found_data) 4285 if (!ctx->found_data)
4285 return -ENOMEM; 4286 return -ENOMEM;
4286 return 1; 4287 return 1;
@@ -4481,7 +4482,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
4481 while (index <= last_index) { 4482 while (index <= last_index) {
4482 unsigned cur_len = min_t(unsigned, len, 4483 unsigned cur_len = min_t(unsigned, len,
4483 PAGE_CACHE_SIZE - pg_offset); 4484 PAGE_CACHE_SIZE - pg_offset);
4484 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 4485 page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
4485 if (!page) { 4486 if (!page) {
4486 ret = -ENOMEM; 4487 ret = -ENOMEM;
4487 break; 4488 break;
@@ -5989,7 +5990,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5989 goto out; 5990 goto out;
5990 } 5991 }
5991 5992
5992 sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); 5993 sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
5993 if (!sctx) { 5994 if (!sctx) {
5994 ret = -ENOMEM; 5995 ret = -ENOMEM;
5995 goto out; 5996 goto out;
@@ -5997,7 +5998,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5997 5998
5998 INIT_LIST_HEAD(&sctx->new_refs); 5999 INIT_LIST_HEAD(&sctx->new_refs);
5999 INIT_LIST_HEAD(&sctx->deleted_refs); 6000 INIT_LIST_HEAD(&sctx->deleted_refs);
6000 INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); 6001 INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
6001 INIT_LIST_HEAD(&sctx->name_cache_list); 6002 INIT_LIST_HEAD(&sctx->name_cache_list);
6002 6003
6003 sctx->flags = arg->flags; 6004 sctx->flags = arg->flags;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d41e09fe8e38..00b8f37cc306 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -303,7 +303,8 @@ enum {
303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, 303 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, 304 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, 305 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
306 Opt_datasum, Opt_treelog, Opt_noinode_cache, 306 Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
307 Opt_nologreplay, Opt_norecovery,
307#ifdef CONFIG_BTRFS_DEBUG 308#ifdef CONFIG_BTRFS_DEBUG
308 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, 309 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
309#endif 310#endif
@@ -335,6 +336,8 @@ static const match_table_t tokens = {
335 {Opt_noacl, "noacl"}, 336 {Opt_noacl, "noacl"},
336 {Opt_notreelog, "notreelog"}, 337 {Opt_notreelog, "notreelog"},
337 {Opt_treelog, "treelog"}, 338 {Opt_treelog, "treelog"},
339 {Opt_nologreplay, "nologreplay"},
340 {Opt_norecovery, "norecovery"},
338 {Opt_flushoncommit, "flushoncommit"}, 341 {Opt_flushoncommit, "flushoncommit"},
339 {Opt_noflushoncommit, "noflushoncommit"}, 342 {Opt_noflushoncommit, "noflushoncommit"},
340 {Opt_ratio, "metadata_ratio=%d"}, 343 {Opt_ratio, "metadata_ratio=%d"},
@@ -352,7 +355,8 @@ static const match_table_t tokens = {
352 {Opt_inode_cache, "inode_cache"}, 355 {Opt_inode_cache, "inode_cache"},
353 {Opt_noinode_cache, "noinode_cache"}, 356 {Opt_noinode_cache, "noinode_cache"},
354 {Opt_no_space_cache, "nospace_cache"}, 357 {Opt_no_space_cache, "nospace_cache"},
355 {Opt_recovery, "recovery"}, 358 {Opt_recovery, "recovery"}, /* deprecated */
359 {Opt_usebackuproot, "usebackuproot"},
356 {Opt_skip_balance, "skip_balance"}, 360 {Opt_skip_balance, "skip_balance"},
357 {Opt_check_integrity, "check_int"}, 361 {Opt_check_integrity, "check_int"},
358 {Opt_check_integrity_including_extent_data, "check_int_data"}, 362 {Opt_check_integrity_including_extent_data, "check_int_data"},
@@ -373,7 +377,8 @@ static const match_table_t tokens = {
373 * reading in a new superblock is parsed here. 377 * reading in a new superblock is parsed here.
374 * XXX JDM: This needs to be cleaned up for remount. 378 * XXX JDM: This needs to be cleaned up for remount.
375 */ 379 */
376int btrfs_parse_options(struct btrfs_root *root, char *options) 380int btrfs_parse_options(struct btrfs_root *root, char *options,
381 unsigned long new_flags)
377{ 382{
378 struct btrfs_fs_info *info = root->fs_info; 383 struct btrfs_fs_info *info = root->fs_info;
379 substring_t args[MAX_OPT_ARGS]; 384 substring_t args[MAX_OPT_ARGS];
@@ -393,8 +398,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
393 else if (cache_gen) 398 else if (cache_gen)
394 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 399 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
395 400
401 /*
402 * Even if the options are empty, we still need to do the extra check
403 * against the new flags
404 */
396 if (!options) 405 if (!options)
397 goto out; 406 goto check;
398 407
399 /* 408 /*
400 * strsep changes the string, duplicate it because parse_options 409 * strsep changes the string, duplicate it because parse_options
@@ -606,6 +615,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
606 btrfs_clear_and_info(root, NOTREELOG, 615 btrfs_clear_and_info(root, NOTREELOG,
607 "enabling tree log"); 616 "enabling tree log");
608 break; 617 break;
618 case Opt_norecovery:
619 case Opt_nologreplay:
620 btrfs_set_and_info(root, NOLOGREPLAY,
621 "disabling log replay at mount time");
622 break;
609 case Opt_flushoncommit: 623 case Opt_flushoncommit:
610 btrfs_set_and_info(root, FLUSHONCOMMIT, 624 btrfs_set_and_info(root, FLUSHONCOMMIT,
611 "turning on flush-on-commit"); 625 "turning on flush-on-commit");
@@ -696,8 +710,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
696 "disabling auto defrag"); 710 "disabling auto defrag");
697 break; 711 break;
698 case Opt_recovery: 712 case Opt_recovery:
699 btrfs_info(root->fs_info, "enabling auto recovery"); 713 btrfs_warn(root->fs_info,
700 btrfs_set_opt(info->mount_opt, RECOVERY); 714 "'recovery' is deprecated, use 'usebackuproot' instead");
715 case Opt_usebackuproot:
716 btrfs_info(root->fs_info,
717 "trying to use backup root at mount time");
718 btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
701 break; 719 break;
702 case Opt_skip_balance: 720 case Opt_skip_balance:
703 btrfs_set_opt(info->mount_opt, SKIP_BALANCE); 721 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
@@ -792,6 +810,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
792 break; 810 break;
793 } 811 }
794 } 812 }
813check:
814 /*
815 * Extra check of the current options against the new flags
816 */
817 if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
818 btrfs_err(root->fs_info,
819 "nologreplay must be used with ro mount option");
820 ret = -EINVAL;
821 }
795out: 822out:
796 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && 823 if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
797 !btrfs_test_opt(root, FREE_SPACE_TREE) && 824 !btrfs_test_opt(root, FREE_SPACE_TREE) &&
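
Threading new_flags into btrfs_parse_options() lets a single check cover mount and remount alike, which is why the empty-options case above now jumps to the check label instead of out: a plain 'mount -o remount,rw' carries no option string but must still be refused on a nologreplay filesystem. The remount caller, shown further down, simply forwards the prospective flags:

/*
 * Updated remount call site: *flags holds the flags about to take
 * effect, MS_RDONLY being the vfs read-only bit the check tests.
 */
ret = btrfs_parse_options(root, data, *flags);
if (ret)
        goto restore;
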
@@ -1202,6 +1229,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1202 seq_puts(seq, ",ssd"); 1229 seq_puts(seq, ",ssd");
1203 if (btrfs_test_opt(root, NOTREELOG)) 1230 if (btrfs_test_opt(root, NOTREELOG))
1204 seq_puts(seq, ",notreelog"); 1231 seq_puts(seq, ",notreelog");
1232 if (btrfs_test_opt(root, NOLOGREPLAY))
1233 seq_puts(seq, ",nologreplay");
1205 if (btrfs_test_opt(root, FLUSHONCOMMIT)) 1234 if (btrfs_test_opt(root, FLUSHONCOMMIT))
1206 seq_puts(seq, ",flushoncommit"); 1235 seq_puts(seq, ",flushoncommit");
1207 if (btrfs_test_opt(root, DISCARD)) 1236 if (btrfs_test_opt(root, DISCARD))
@@ -1228,8 +1257,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1228 seq_puts(seq, ",inode_cache"); 1257 seq_puts(seq, ",inode_cache");
1229 if (btrfs_test_opt(root, SKIP_BALANCE)) 1258 if (btrfs_test_opt(root, SKIP_BALANCE))
1230 seq_puts(seq, ",skip_balance"); 1259 seq_puts(seq, ",skip_balance");
1231 if (btrfs_test_opt(root, RECOVERY))
1232 seq_puts(seq, ",recovery");
1233#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1260#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1234 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) 1261 if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
1235 seq_puts(seq, ",check_int_data"); 1262 seq_puts(seq, ",check_int_data");
@@ -1685,7 +1712,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1685 } 1712 }
1686 } 1713 }
1687 1714
1688 ret = btrfs_parse_options(root, data); 1715 ret = btrfs_parse_options(root, data, *flags);
1689 if (ret) { 1716 if (ret) {
1690 ret = -EINVAL; 1717 ret = -EINVAL;
1691 goto restore; 1718 goto restore;
@@ -2163,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
2163 break; 2190 break;
2164 ret = !(fs_devices->num_devices == fs_devices->total_devices); 2191 ret = !(fs_devices->num_devices == fs_devices->total_devices);
2165 break; 2192 break;
2193 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
2194 ret = btrfs_ioctl_get_supported_features((void __user*)arg);
2195 break;
2166 } 2196 }
2167 2197
2168 kfree(vol); 2198 kfree(vol);
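
Handling BTRFS_IOC_GET_SUPPORTED_FEATURES on /dev/btrfs-control means tools can query the kernel's feature support with no filesystem mounted (e.g. mkfs deciding which incompat bits it may set). A hypothetical userspace probe, assuming the uapi layout of three btrfs_ioctl_feature_flags entries (supported, safe to set, safe to clear, per the INIT_FEATURE_FLAGS initializers above):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>        /* BTRFS_IOC_GET_SUPPORTED_FEATURES */

int main(void)
{
        struct btrfs_ioctl_feature_flags flags[3];
        int fd = open("/dev/btrfs-control", O_RDONLY);

        if (fd < 0 || ioctl(fd, BTRFS_IOC_GET_SUPPORTED_FEATURES, flags))
                return 1;
        printf("supported incompat bits: 0x%llx\n",
               (unsigned long long)flags[0].incompat_flags);
        return 0;
}
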
@@ -2261,7 +2291,7 @@ static void btrfs_interface_exit(void)
2261 misc_deregister(&btrfs_misc); 2291 misc_deregister(&btrfs_misc);
2262} 2292}
2263 2293
2264static void btrfs_print_info(void) 2294static void btrfs_print_mod_info(void)
2265{ 2295{
2266 printk(KERN_INFO "Btrfs loaded" 2296 printk(KERN_INFO "Btrfs loaded"
2267#ifdef CONFIG_BTRFS_DEBUG 2297#ifdef CONFIG_BTRFS_DEBUG
@@ -2363,7 +2393,7 @@ static int __init init_btrfs_fs(void)
2363 2393
2364 btrfs_init_lockdep(); 2394 btrfs_init_lockdep();
2365 2395
2366 btrfs_print_info(); 2396 btrfs_print_mod_info();
2367 2397
2368 err = btrfs_run_sanity_tests(); 2398 err = btrfs_run_sanity_tests();
2369 if (err) 2399 if (err)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 0e1e61a7ec23..f54bf450bad3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
137 void **slot; 137 void **slot;
138 138
139 spin_lock(&fs_info->buffer_lock); 139 spin_lock(&fs_info->buffer_lock);
140restart:
141 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { 140 radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
142 struct extent_buffer *eb; 141 struct extent_buffer *eb;
143 142
@@ -147,7 +146,7 @@ restart:
147 /* Shouldn't happen but that kind of thinking creates CVE's */ 146 /* Shouldn't happen but that kind of thinking creates CVE's */
148 if (radix_tree_exception(eb)) { 147 if (radix_tree_exception(eb)) {
149 if (radix_tree_deref_retry(eb)) 148 if (radix_tree_deref_retry(eb))
150 goto restart; 149 slot = radix_tree_iter_retry(&iter);
151 continue; 150 continue;
152 } 151 }
153 spin_unlock(&fs_info->buffer_lock); 152 spin_unlock(&fs_info->buffer_lock);
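
The removed restart label rescanned the whole tree from index 0 on every deref retry, all while holding fs_info->buffer_lock; radix_tree_iter_retry(), new around this kernel, rewinds the iterator so only the current index is retried. Usage shape:

radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
        struct extent_buffer *eb = radix_tree_deref_slot(slot);

        if (radix_tree_exception(eb)) {
                if (radix_tree_deref_retry(eb))
                        /* retry this slot in place instead of a full
                         * goto-restart rescan from index 0 */
                        slot = radix_tree_iter_retry(&iter);
                continue;
        }
        /* ... operate on eb ... */
}
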
@@ -189,12 +188,6 @@ btrfs_alloc_dummy_block_group(unsigned long length)
189 kfree(cache); 188 kfree(cache);
190 return NULL; 189 return NULL;
191 } 190 }
192 cache->fs_info = btrfs_alloc_dummy_fs_info();
193 if (!cache->fs_info) {
194 kfree(cache->free_space_ctl);
195 kfree(cache);
196 return NULL;
197 }
198 191
199 cache->key.objectid = 0; 192 cache->key.objectid = 0;
200 cache->key.offset = length; 193 cache->key.offset = length;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index d05fe1ab4808..7cea4462acd5 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -485,6 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps)
485 cache->bitmap_low_thresh = 0; 485 cache->bitmap_low_thresh = 0;
486 cache->bitmap_high_thresh = (u32)-1; 486 cache->bitmap_high_thresh = (u32)-1;
487 cache->needs_free_space = 1; 487 cache->needs_free_space = 1;
488 cache->fs_info = root->fs_info;
488 489
489 btrfs_init_dummy_trans(&trans); 490 btrfs_init_dummy_trans(&trans);
490 491
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e2d3da02deee..863a6a3af1f8 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -22,6 +22,7 @@
22#include "../disk-io.h" 22#include "../disk-io.h"
23#include "../extent_io.h" 23#include "../extent_io.h"
24#include "../volumes.h" 24#include "../volumes.h"
25#include "../compression.h"
25 26
26static void insert_extent(struct btrfs_root *root, u64 start, u64 len, 27static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
27 u64 ram_bytes, u64 offset, u64 disk_bytenr, 28 u64 ram_bytes, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b6031ce474f7..43885e51b882 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -637,6 +637,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
637 637
638 trans->block_rsv = &root->fs_info->trans_block_rsv; 638 trans->block_rsv = &root->fs_info->trans_block_rsv;
639 trans->bytes_reserved = num_bytes; 639 trans->bytes_reserved = num_bytes;
640 trace_btrfs_space_reservation(root->fs_info, "transaction",
641 trans->transid, num_bytes, 1);
640 642
641 return trans; 643 return trans;
642} 644}
@@ -1333,7 +1335,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1333 struct dentry *dentry; 1335 struct dentry *dentry;
1334 struct extent_buffer *tmp; 1336 struct extent_buffer *tmp;
1335 struct extent_buffer *old; 1337 struct extent_buffer *old;
1336 struct timespec cur_time = CURRENT_TIME; 1338 struct timespec cur_time;
1337 int ret = 0; 1339 int ret = 0;
1338 u64 to_reserve = 0; 1340 u64 to_reserve = 0;
1339 u64 index = 0; 1341 u64 index = 0;
@@ -1375,12 +1377,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1375 rsv = trans->block_rsv; 1377 rsv = trans->block_rsv;
1376 trans->block_rsv = &pending->block_rsv; 1378 trans->block_rsv = &pending->block_rsv;
1377 trans->bytes_reserved = trans->block_rsv->reserved; 1379 trans->bytes_reserved = trans->block_rsv->reserved;
1378 1380 trace_btrfs_space_reservation(root->fs_info, "transaction",
1381 trans->transid,
1382 trans->bytes_reserved, 1);
1379 dentry = pending->dentry; 1383 dentry = pending->dentry;
1380 parent_inode = pending->dir; 1384 parent_inode = pending->dir;
1381 parent_root = BTRFS_I(parent_inode)->root; 1385 parent_root = BTRFS_I(parent_inode)->root;
1382 record_root_in_trans(trans, parent_root); 1386 record_root_in_trans(trans, parent_root);
1383 1387
1388 cur_time = current_fs_time(parent_inode->i_sb);
1389
1384 /* 1390 /*
1385 * insert the directory item 1391 * insert the directory item
1386 */ 1392 */
@@ -1523,7 +1529,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1523 1529
1524 btrfs_i_size_write(parent_inode, parent_inode->i_size + 1530 btrfs_i_size_write(parent_inode, parent_inode->i_size +
1525 dentry->d_name.len * 2); 1531 dentry->d_name.len * 2);
1526 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 1532 parent_inode->i_mtime = parent_inode->i_ctime =
1533 current_fs_time(parent_inode->i_sb);
1527 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); 1534 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
1528 if (ret) { 1535 if (ret) {
1529 btrfs_abort_transaction(trans, root, ret); 1536 btrfs_abort_transaction(trans, root, ret);
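The transaction.c hunks above replace the fixed-resolution CURRENT_TIME macro
with current_fs_time(sb), which truncates the raw clock to the granularity the
filesystem declares in sb->s_time_gran; note that the snapshot path samples
cur_time from the parent inode's superblock. A one-helper sketch of the idiom
(the helper name is hypothetical):

    /* stamp ctime at the owning filesystem's timestamp granularity */
    static void stamp_ctime(struct inode *inode)
    {
            inode->i_ctime = current_fs_time(inode->i_sb);
    }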
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 978c3a810893..24d03c751149 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
26#include "print-tree.h" 26#include "print-tree.h"
27#include "backref.h" 27#include "backref.h"
28#include "hash.h" 28#include "hash.h"
29#include "compression.h"
29 30
30/* magic values for the inode_only field in btrfs_log_inode: 31/* magic values for the inode_only field in btrfs_log_inode:
31 * 32 *
@@ -1045,7 +1046,7 @@ again:
1045 1046
1046 /* 1047 /*
1047 * NOTE: we have searched root tree and checked the 1048 * NOTE: we have searched root tree and checked the
1048 * coresponding ref, it does not need to check again. 1049 * corresponding ref, it does not need to check again.
1049 */ 1050 */
1050 *search_done = 1; 1051 *search_done = 1;
1051 } 1052 }
@@ -4500,7 +4501,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4500 4501
4501 mutex_lock(&BTRFS_I(inode)->log_mutex); 4502 mutex_lock(&BTRFS_I(inode)->log_mutex);
4502 4503
4503 btrfs_get_logged_extents(inode, &logged_list, start, end); 4504 /*
4505 * Collect ordered extents only if we are logging data. This is to
4506 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
4507 * will process the ordered extents if they still exist at the time,
4508 * because when we collect them we test and set the flag
4509 * BTRFS_ORDERED_LOGGED to prevent multiple log requests from processing the
4510 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
4511 * not processing the ordered extents is that we end up logging the
4512 * corresponding file extent items, based on the extent maps in the
4513 * inode's extent_map_tree's modified_list, without logging the
4514 * respective checksums (since they may still be only attached to the
4515 * ordered extents and have not been inserted in the csum tree by
4516 * btrfs_finish_ordered_io() yet).
4517 */
4518 if (inode_only == LOG_INODE_ALL)
4519 btrfs_get_logged_extents(inode, &logged_list, start, end);
4504 4520
4505 /* 4521 /*
4506 * a brute force approach to making sure we get the most uptodate 4522 * a brute force approach to making sure we get the most uptodate
@@ -4772,6 +4788,42 @@ out_unlock:
4772} 4788}
4773 4789
4774/* 4790/*
4791 * Check if we must fallback to a transaction commit when logging an inode.
4792 * This must be called after logging the inode and is used only in the context
4793 * when fsyncing an inode requires logging some other inode - in which
4794 * case we can't lock the i_mutex of each other inode we need to log as that
4795 * can lead to deadlocks with concurrent fsync against other inodes (as we can
4796 * log inodes up or down in the hierarchy) or rename operations for example. So
4797 * we take the log_mutex of the inode after we have logged it and then check for
4798 * its last_unlink_trans value - this is safe because any task setting
4799 * last_unlink_trans must take the log_mutex and it must do this before it does
4800 * the actual unlink operation, so if we do this check before a concurrent task
4801 * sets last_unlink_trans it means we've logged a consistent version/state of
4802 * all the inode items, otherwise we are not sure and must do a transaction
4803 * commit (the concurrent task might have only updated last_unlink_trans before
4804 * we logged the inode or it might have also done the unlink).
4805 */
4806static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
4807 struct inode *inode)
4808{
4809 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4810 bool ret = false;
4811
4812 mutex_lock(&BTRFS_I(inode)->log_mutex);
4813 if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
4814 /*
4815 * Make sure any commits to the log are forced to be full
4816 * commits.
4817 */
4818 btrfs_set_log_full_commit(fs_info, trans);
4819 ret = true;
4820 }
4821 mutex_unlock(&BTRFS_I(inode)->log_mutex);
4822
4823 return ret;
4824}
4825
4826/*
4775 * follow the dentry parent pointers up the chain and see if any 4827 * follow the dentry parent pointers up the chain and see if any
4776 * of the directories in it require a full commit before they can 4828 * of the directories in it require a full commit before they can
4777 * be logged. Returns zero if nothing special needs to be done or 1 if 4829 * be logged. Returns zero if nothing special needs to be done or 1 if
@@ -4784,7 +4836,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4784 u64 last_committed) 4836 u64 last_committed)
4785{ 4837{
4786 int ret = 0; 4838 int ret = 0;
4787 struct btrfs_root *root;
4788 struct dentry *old_parent = NULL; 4839 struct dentry *old_parent = NULL;
4789 struct inode *orig_inode = inode; 4840 struct inode *orig_inode = inode;
4790 4841
@@ -4816,14 +4867,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4816 BTRFS_I(inode)->logged_trans = trans->transid; 4867 BTRFS_I(inode)->logged_trans = trans->transid;
4817 smp_mb(); 4868 smp_mb();
4818 4869
4819 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 4870 if (btrfs_must_commit_transaction(trans, inode)) {
4820 root = BTRFS_I(inode)->root;
4821
4822 /*
4823 * make sure any commits to the log are forced
4824 * to be full commits
4825 */
4826 btrfs_set_log_full_commit(root->fs_info, trans);
4827 ret = 1; 4871 ret = 1;
4828 break; 4872 break;
4829 } 4873 }
@@ -4982,6 +5026,9 @@ process_leaf:
4982 btrfs_release_path(path); 5026 btrfs_release_path(path);
4983 ret = btrfs_log_inode(trans, root, di_inode, 5027 ret = btrfs_log_inode(trans, root, di_inode,
4984 log_mode, 0, LLONG_MAX, ctx); 5028 log_mode, 0, LLONG_MAX, ctx);
5029 if (!ret &&
5030 btrfs_must_commit_transaction(trans, di_inode))
5031 ret = 1;
4985 iput(di_inode); 5032 iput(di_inode);
4986 if (ret) 5033 if (ret)
4987 goto next_dir_inode; 5034 goto next_dir_inode;
@@ -5096,6 +5143,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5096 5143
5097 ret = btrfs_log_inode(trans, root, dir_inode, 5144 ret = btrfs_log_inode(trans, root, dir_inode,
5098 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5145 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5146 if (!ret &&
5147 btrfs_must_commit_transaction(trans, dir_inode))
5148 ret = 1;
5099 iput(dir_inode); 5149 iput(dir_inode);
5100 if (ret) 5150 if (ret)
5101 goto out; 5151 goto out;
@@ -5447,6 +5497,9 @@ error:
5447 * They revolve around files that were unlinked from the directory, and 5497 * They revolve around files that were unlinked from the directory, and
5448 * this function updates the parent directory so that a full commit is 5498 * this function updates the parent directory so that a full commit is
5449 * properly done if it is fsync'd later after the unlinks are done. 5499 * properly done if it is fsync'd later after the unlinks are done.
5500 *
5501 * Must be called before the unlink operations (updates to the subvolume tree,
5502 * inodes, etc) are done.
5450 */ 5503 */
5451void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 5504void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5452 struct inode *dir, struct inode *inode, 5505 struct inode *dir, struct inode *inode,
@@ -5462,8 +5515,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5462 * into the file. When the file is logged we check it and 5515 * into the file. When the file is logged we check it and
5463 * don't log the parents if the file is fully on disk. 5516 * don't log the parents if the file is fully on disk.
5464 */ 5517 */
5465 if (S_ISREG(inode->i_mode)) 5518 if (S_ISREG(inode->i_mode)) {
5519 mutex_lock(&BTRFS_I(inode)->log_mutex);
5466 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5520 BTRFS_I(inode)->last_unlink_trans = trans->transid;
5521 mutex_unlock(&BTRFS_I(inode)->log_mutex);
5522 }
5467 5523
5468 /* 5524 /*
5469 * if this directory was already logged any new 5525 * if this directory was already logged any new
@@ -5494,7 +5550,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5494 return; 5550 return;
5495 5551
5496record: 5552record:
5553 mutex_lock(&BTRFS_I(dir)->log_mutex);
5554 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5555 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5556}
5557
5558/*
5559 * Make sure that if someone attempts to fsync the parent directory of a deleted
5560 * snapshot, it ends up triggering a transaction commit. This is to guarantee
5561 * that after replaying the log tree of the parent directory's root we will not
5562 * see the snapshot anymore and at log replay time we will not see any log tree
5563 * corresponding to the deleted snapshot's root, which could lead to replaying
5564 * it after replaying the log tree of the parent directory (which would replay
5565 * the snapshot delete operation).
5566 *
5567 * Must be called before the actual snapshot destroy operation (updates to the
5568 * parent root and tree of tree roots trees, etc) are done.
5569 */
5570void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
5571 struct inode *dir)
5572{
5573 mutex_lock(&BTRFS_I(dir)->log_mutex);
5497 BTRFS_I(dir)->last_unlink_trans = trans->transid; 5574 BTRFS_I(dir)->last_unlink_trans = trans->transid;
5575 mutex_unlock(&BTRFS_I(dir)->log_mutex);
5498} 5576}
5499 5577
5500/* 5578/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 6916a781ea02..a9f1b75d080d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
79void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 79void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
80 struct inode *dir, struct inode *inode, 80 struct inode *dir, struct inode *inode,
81 int for_rename); 81 int for_rename);
82void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
83 struct inode *dir);
82int btrfs_log_new_name(struct btrfs_trans_handle *trans, 84int btrfs_log_new_name(struct btrfs_trans_handle *trans,
83 struct inode *inode, struct inode *old_dir, 85 struct inode *inode, struct inode *old_dir,
84 struct dentry *parent); 86 struct dentry *parent);
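The tree-log changes are all one pattern: last_unlink_trans becomes a value
published under the inode's log_mutex, and the fsync side checks it only after
it has finished logging. Because the unlink side publishes before it touches
the tree, a logger that still sees an old value is guaranteed to have captured
a consistent pre-unlink state; otherwise it falls back to a full transaction
commit. A schematic of the two sides, with generic names standing in for the
btrfs structures:

    /* unlink/rename side: publish before doing the operation */
    mutex_lock(&obj->log_mutex);
    obj->last_unlink_trans = trans->transid;
    mutex_unlock(&obj->log_mutex);
    do_unlink(obj);                         /* hypothetical */

    /* fsync side: log first, then check under the same mutex */
    log_inode(obj);                         /* hypothetical */
    mutex_lock(&obj->log_mutex);
    if (obj->last_unlink_trans > last_committed)
            need_full_commit = true;        /* can't trust the log alone */
    mutex_unlock(&obj->log_mutex);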
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 366b335946fa..e2b54d546b7c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,7 +138,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
138{ 138{
139 struct btrfs_fs_devices *fs_devs; 139 struct btrfs_fs_devices *fs_devs;
140 140
141 fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); 141 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
142 if (!fs_devs) 142 if (!fs_devs)
143 return ERR_PTR(-ENOMEM); 143 return ERR_PTR(-ENOMEM);
144 144
@@ -220,7 +220,7 @@ static struct btrfs_device *__alloc_device(void)
220{ 220{
221 struct btrfs_device *dev; 221 struct btrfs_device *dev;
222 222
223 dev = kzalloc(sizeof(*dev), GFP_NOFS); 223 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
224 if (!dev) 224 if (!dev)
225 return ERR_PTR(-ENOMEM); 225 return ERR_PTR(-ENOMEM);
226 226
@@ -733,7 +733,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
733 * uuid mutex so nothing we touch in here is going to disappear. 733 * uuid mutex so nothing we touch in here is going to disappear.
734 */ 734 */
735 if (orig_dev->name) { 735 if (orig_dev->name) {
736 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 736 name = rcu_string_strdup(orig_dev->name->str,
737 GFP_KERNEL);
737 if (!name) { 738 if (!name) {
738 kfree(device); 739 kfree(device);
739 goto error; 740 goto error;
@@ -1714,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1714 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1715 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1715 1716
1716 num_devices = root->fs_info->fs_devices->num_devices; 1717 num_devices = root->fs_info->fs_devices->num_devices;
1717 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1718 btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
1718 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1719 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1719 WARN_ON(num_devices < 1); 1720 WARN_ON(num_devices < 1);
1720 num_devices--; 1721 num_devices--;
1721 } 1722 }
1722 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1723 btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
1723 1724
1724 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1725 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1725 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1726 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
@@ -2287,7 +2288,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2287 goto error; 2288 goto error;
2288 } 2289 }
2289 2290
2290 name = rcu_string_strdup(device_path, GFP_NOFS); 2291 name = rcu_string_strdup(device_path, GFP_KERNEL);
2291 if (!name) { 2292 if (!name) {
2292 kfree(device); 2293 kfree(device);
2293 ret = -ENOMEM; 2294 ret = -ENOMEM;
@@ -2748,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2748 em->start + em->len < chunk_offset) { 2749 em->start + em->len < chunk_offset) {
2749 /* 2750 /*
2750 * This is a logic error, but we don't want to just rely on the 2751 * This is a logic error, but we don't want to just rely on the
2751 * user having built with ASSERT enabled, so if ASSERT doens't 2752 * user having built with ASSERT enabled, so if ASSERT doesn't
2752 * do anything we still error out. 2753 * do anything we still error out.
2753 */ 2754 */
2754 ASSERT(0); 2755 ASSERT(0);
@@ -2966,7 +2967,7 @@ static int insert_balance_item(struct btrfs_root *root,
2966 } 2967 }
2967 2968
2968 key.objectid = BTRFS_BALANCE_OBJECTID; 2969 key.objectid = BTRFS_BALANCE_OBJECTID;
2969 key.type = BTRFS_BALANCE_ITEM_KEY; 2970 key.type = BTRFS_TEMPORARY_ITEM_KEY;
2970 key.offset = 0; 2971 key.offset = 0;
2971 2972
2972 ret = btrfs_insert_empty_item(trans, root, path, &key, 2973 ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -3015,7 +3016,7 @@ static int del_balance_item(struct btrfs_root *root)
3015 } 3016 }
3016 3017
3017 key.objectid = BTRFS_BALANCE_OBJECTID; 3018 key.objectid = BTRFS_BALANCE_OBJECTID;
3018 key.type = BTRFS_BALANCE_ITEM_KEY; 3019 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3019 key.offset = 0; 3020 key.offset = 0;
3020 3021
3021 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3022 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -3686,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3686 } 3687 }
3687 3688
3688 num_devices = fs_info->fs_devices->num_devices; 3689 num_devices = fs_info->fs_devices->num_devices;
3689 btrfs_dev_replace_lock(&fs_info->dev_replace); 3690 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3690 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3691 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3691 BUG_ON(num_devices < 1); 3692 BUG_ON(num_devices < 1);
3692 num_devices--; 3693 num_devices--;
3693 } 3694 }
3694 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3695 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3695 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3696 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3696 if (num_devices == 1) 3697 if (num_devices == 1)
3697 allowed |= BTRFS_BLOCK_GROUP_DUP; 3698 allowed |= BTRFS_BLOCK_GROUP_DUP;
@@ -3867,7 +3868,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3867 return -ENOMEM; 3868 return -ENOMEM;
3868 3869
3869 key.objectid = BTRFS_BALANCE_OBJECTID; 3870 key.objectid = BTRFS_BALANCE_OBJECTID;
3870 key.type = BTRFS_BALANCE_ITEM_KEY; 3871 key.type = BTRFS_TEMPORARY_ITEM_KEY;
3871 key.offset = 0; 3872 key.offset = 0;
3872 3873
3873 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3874 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
@@ -4118,7 +4119,7 @@ out:
4118 * Callback for btrfs_uuid_tree_iterate(). 4119 * Callback for btrfs_uuid_tree_iterate().
4119 * returns: 4120 * returns:
4120 * 0 check succeeded, the entry is not outdated. 4121 * 0 check succeeded, the entry is not outdated.
4121 * < 0 if an error occured. 4122 * < 0 if an error occurred.
4122 * > 0 if the check failed, which means the caller shall remove the entry. 4123 * > 0 if the check failed, which means the caller shall remove the entry.
4123 */ 4124 */
4124static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 4125static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
@@ -5062,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5062 ret = 1; 5063 ret = 1;
5063 free_extent_map(em); 5064 free_extent_map(em);
5064 5065
5065 btrfs_dev_replace_lock(&fs_info->dev_replace); 5066 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
5066 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 5067 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
5067 ret++; 5068 ret++;
5068 btrfs_dev_replace_unlock(&fs_info->dev_replace); 5069 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
5069 5070
5070 return ret; 5071 return ret;
5071} 5072}
@@ -5325,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5325 if (!bbio_ret) 5326 if (!bbio_ret)
5326 goto out; 5327 goto out;
5327 5328
5328 btrfs_dev_replace_lock(dev_replace); 5329 btrfs_dev_replace_lock(dev_replace, 0);
5329 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5330 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5330 if (!dev_replace_is_ongoing) 5331 if (!dev_replace_is_ongoing)
5331 btrfs_dev_replace_unlock(dev_replace); 5332 btrfs_dev_replace_unlock(dev_replace, 0);
5333 else
5334 btrfs_dev_replace_set_lock_blocking(dev_replace);
5332 5335
5333 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5336 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5334 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 5337 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
@@ -5751,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5751 bbio->mirror_num = map->num_stripes + 1; 5754 bbio->mirror_num = map->num_stripes + 1;
5752 } 5755 }
5753out: 5756out:
5754 if (dev_replace_is_ongoing) 5757 if (dev_replace_is_ongoing) {
5755 btrfs_dev_replace_unlock(dev_replace); 5758 btrfs_dev_replace_clear_lock_blocking(dev_replace);
5759 btrfs_dev_replace_unlock(dev_replace, 0);
5760 }
5756 free_extent_map(em); 5761 free_extent_map(em);
5757 return ret; 5762 return ret;
5758} 5763}
@@ -6705,8 +6710,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6705 int item_size; 6710 int item_size;
6706 struct btrfs_dev_stats_item *ptr; 6711 struct btrfs_dev_stats_item *ptr;
6707 6712
6708 key.objectid = 0; 6713 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6709 key.type = BTRFS_DEV_STATS_KEY; 6714 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6710 key.offset = device->devid; 6715 key.offset = device->devid;
6711 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 6716 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6712 if (ret) { 6717 if (ret) {
@@ -6753,8 +6758,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6753 int ret; 6758 int ret;
6754 int i; 6759 int i;
6755 6760
6756 key.objectid = 0; 6761 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6757 key.type = BTRFS_DEV_STATS_KEY; 6762 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6758 key.offset = device->devid; 6763 key.offset = device->devid;
6759 6764
6760 path = btrfs_alloc_path(); 6765 path = btrfs_alloc_path();
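Two of the volumes.c hunks consolidate on-disk key types: the balance status
item moves from its dedicated BTRFS_BALANCE_ITEM_KEY to the shared
BTRFS_TEMPORARY_ITEM_KEY, and the per-device statistics move from objectid 0
to BTRFS_DEV_STATS_OBJECTID under BTRFS_PERSISTENT_ITEM_KEY, with the devid
carried in the key offset so there is one item per device. Pulled out of the
diff, the two key triples are:

    struct btrfs_key key;

    /* balance status: a single per-fs item, offset unused */
    key.objectid = BTRFS_BALANCE_OBJECTID;
    key.type = BTRFS_TEMPORARY_ITEM_KEY;
    key.offset = 0;

    /* device statistics: one item per device, keyed by devid */
    key.objectid = BTRFS_DEV_STATS_OBJECTID;
    key.type = BTRFS_PERSISTENT_ITEM_KEY;
    key.offset = device->devid;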
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6c68d6356197..145d2b89e62d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -249,7 +249,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
249 goto out; 249 goto out;
250 250
251 inode_inc_iversion(inode); 251 inode_inc_iversion(inode);
252 inode->i_ctime = CURRENT_TIME; 252 inode->i_ctime = current_fs_time(inode->i_sb);
253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 253 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
254 ret = btrfs_update_inode(trans, root, inode); 254 ret = btrfs_update_inode(trans, root, inode);
255 BUG_ON(ret); 255 BUG_ON(ret);
@@ -260,16 +260,12 @@ out:
260 260
261ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) 261ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
262{ 262{
263 struct btrfs_key key, found_key; 263 struct btrfs_key key;
264 struct inode *inode = d_inode(dentry); 264 struct inode *inode = d_inode(dentry);
265 struct btrfs_root *root = BTRFS_I(inode)->root; 265 struct btrfs_root *root = BTRFS_I(inode)->root;
266 struct btrfs_path *path; 266 struct btrfs_path *path;
267 struct extent_buffer *leaf; 267 int ret = 0;
268 struct btrfs_dir_item *di;
269 int ret = 0, slot;
270 size_t total_size = 0, size_left = size; 268 size_t total_size = 0, size_left = size;
271 unsigned long name_ptr;
272 size_t name_len;
273 269
274 /* 270 /*
275 * ok we want all objects associated with this id. 271 * ok we want all objects associated with this id.
@@ -291,6 +287,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
291 goto err; 287 goto err;
292 288
293 while (1) { 289 while (1) {
290 struct extent_buffer *leaf;
291 int slot;
292 struct btrfs_dir_item *di;
293 struct btrfs_key found_key;
294 u32 item_size;
295 u32 cur;
296
294 leaf = path->nodes[0]; 297 leaf = path->nodes[0];
295 slot = path->slots[0]; 298 slot = path->slots[0];
296 299
@@ -316,31 +319,45 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
316 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 319 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
317 break; 320 break;
318 if (found_key.type < BTRFS_XATTR_ITEM_KEY) 321 if (found_key.type < BTRFS_XATTR_ITEM_KEY)
319 goto next; 322 goto next_item;
320 323
321 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 324 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
322 if (verify_dir_item(root, leaf, di)) 325 item_size = btrfs_item_size_nr(leaf, slot);
323 goto next; 326 cur = 0;
324 327 while (cur < item_size) {
325 name_len = btrfs_dir_name_len(leaf, di); 328 u16 name_len = btrfs_dir_name_len(leaf, di);
326 total_size += name_len + 1; 329 u16 data_len = btrfs_dir_data_len(leaf, di);
330 u32 this_len = sizeof(*di) + name_len + data_len;
331 unsigned long name_ptr = (unsigned long)(di + 1);
332
333 if (verify_dir_item(root, leaf, di)) {
334 ret = -EIO;
335 goto err;
336 }
327 337
328 /* we are just looking for how big our buffer needs to be */ 338 total_size += name_len + 1;
329 if (!size) 339 /*
330 goto next; 340 * We are just looking for how big our buffer needs to
341 * be.
342 */
343 if (!size)
344 goto next;
331 345
332 if (!buffer || (name_len + 1) > size_left) { 346 if (!buffer || (name_len + 1) > size_left) {
333 ret = -ERANGE; 347 ret = -ERANGE;
334 goto err; 348 goto err;
335 } 349 }
336 350
337 name_ptr = (unsigned long)(di + 1); 351 read_extent_buffer(leaf, buffer, name_ptr, name_len);
338 read_extent_buffer(leaf, buffer, name_ptr, name_len); 352 buffer[name_len] = '\0';
339 buffer[name_len] = '\0';
340 353
341 size_left -= name_len + 1; 354 size_left -= name_len + 1;
342 buffer += name_len + 1; 355 buffer += name_len + 1;
343next: 356next:
357 cur += this_len;
358 di = (struct btrfs_dir_item *)((char *)di + this_len);
359 }
360next_item:
344 path->slots[0]++; 361 path->slots[0]++;
345 } 362 }
346 ret = total_size; 363 ret = total_size;
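The listxattr rewrite above accounts for several btrfs_dir_item entries being
packed back to back inside a single XATTR_ITEM leaf item: each entry is a
header followed by name_len name bytes and data_len value bytes, so the inner
loop advances by sizeof(*di) + name_len + data_len until item_size is
consumed. Reduced to just the walk, using the accessors visible in the hunk:

    u32 item_size = btrfs_item_size_nr(leaf, slot);
    u32 cur = 0;
    struct btrfs_dir_item *di = btrfs_item_ptr(leaf, slot,
                                               struct btrfs_dir_item);

    while (cur < item_size) {
            u16 name_len = btrfs_dir_name_len(leaf, di);
            u16 data_len = btrfs_dir_data_len(leaf, di);
            u32 this_len = sizeof(*di) + name_len + data_len;

            /* the name starts right after the header, at di + 1 */

            cur += this_len;
            di = (struct btrfs_dir_item *)((char *)di + this_len);
    }

Note also that a verify_dir_item() failure is now a hard -EIO instead of a
silent skip of the item.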
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632abb4ca9..33be29675358 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
621 * If warn is true, then emit a warning if the page is not uptodate and has 621 * If warn is true, then emit a warning if the page is not uptodate and has
622 * not been truncated. 622 * not been truncated.
623 * 623 *
624 * The caller must hold mem_cgroup_begin_page_stat() lock. 624 * The caller must hold lock_page_memcg().
625 */ 625 */
626static void __set_page_dirty(struct page *page, struct address_space *mapping, 626static void __set_page_dirty(struct page *page, struct address_space *mapping,
627 struct mem_cgroup *memcg, int warn) 627 int warn)
628{ 628{
629 unsigned long flags; 629 unsigned long flags;
630 630
631 spin_lock_irqsave(&mapping->tree_lock, flags); 631 spin_lock_irqsave(&mapping->tree_lock, flags);
632 if (page->mapping) { /* Race with truncate? */ 632 if (page->mapping) { /* Race with truncate? */
633 WARN_ON_ONCE(warn && !PageUptodate(page)); 633 WARN_ON_ONCE(warn && !PageUptodate(page));
634 account_page_dirtied(page, mapping, memcg); 634 account_page_dirtied(page, mapping);
635 radix_tree_tag_set(&mapping->page_tree, 635 radix_tree_tag_set(&mapping->page_tree,
636 page_index(page), PAGECACHE_TAG_DIRTY); 636 page_index(page), PAGECACHE_TAG_DIRTY);
637 } 637 }
@@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
666int __set_page_dirty_buffers(struct page *page) 666int __set_page_dirty_buffers(struct page *page)
667{ 667{
668 int newly_dirty; 668 int newly_dirty;
669 struct mem_cgroup *memcg;
670 struct address_space *mapping = page_mapping(page); 669 struct address_space *mapping = page_mapping(page);
671 670
672 if (unlikely(!mapping)) 671 if (unlikely(!mapping))
@@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
683 } while (bh != head); 682 } while (bh != head);
684 } 683 }
685 /* 684 /*
686 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 685 * Lock out page->mem_cgroup migration to keep PageDirty
687 * per-memcg dirty page counters. 686 * synchronized with per-memcg dirty page counters.
688 */ 687 */
689 memcg = mem_cgroup_begin_page_stat(page); 688 lock_page_memcg(page);
690 newly_dirty = !TestSetPageDirty(page); 689 newly_dirty = !TestSetPageDirty(page);
691 spin_unlock(&mapping->private_lock); 690 spin_unlock(&mapping->private_lock);
692 691
693 if (newly_dirty) 692 if (newly_dirty)
694 __set_page_dirty(page, mapping, memcg, 1); 693 __set_page_dirty(page, mapping, 1);
695 694
696 mem_cgroup_end_page_stat(memcg); 695 unlock_page_memcg(page);
697 696
698 if (newly_dirty) 697 if (newly_dirty)
699 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 698 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
1167 if (!test_set_buffer_dirty(bh)) { 1166 if (!test_set_buffer_dirty(bh)) {
1168 struct page *page = bh->b_page; 1167 struct page *page = bh->b_page;
1169 struct address_space *mapping = NULL; 1168 struct address_space *mapping = NULL;
1170 struct mem_cgroup *memcg;
1171 1169
1172 memcg = mem_cgroup_begin_page_stat(page); 1170 lock_page_memcg(page);
1173 if (!TestSetPageDirty(page)) { 1171 if (!TestSetPageDirty(page)) {
1174 mapping = page_mapping(page); 1172 mapping = page_mapping(page);
1175 if (mapping) 1173 if (mapping)
1176 __set_page_dirty(page, mapping, memcg, 0); 1174 __set_page_dirty(page, mapping, 0);
1177 } 1175 }
1178 mem_cgroup_end_page_stat(memcg); 1176 unlock_page_memcg(page);
1179 if (mapping) 1177 if (mapping)
1180 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1178 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1181 } 1179 }
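The buffer.c hunks track an mm API change: mem_cgroup_begin_page_stat() and
mem_cgroup_end_page_stat(), which handed back a struct mem_cgroup cookie,
become lock_page_memcg() and unlock_page_memcg(), which simply pin
page->mem_cgroup for the critical section. The dirty-set path ends up shaped
like this (mirroring the hunk above):

    lock_page_memcg(page);
    if (!TestSetPageDirty(page)) {
            struct address_space *mapping = page_mapping(page);

            if (mapping)
                    __set_page_dirty(page, mapping, 0); /* no memcg arg */
    }
    unlock_page_memcg(page);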
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 452e98dd7560..1ee54ffd3a24 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -162,6 +162,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
162 size_t buflen, loff_t *pos) 162 size_t buflen, loff_t *pos)
163{ 163{
164 struct cachefiles_cache *cache = file->private_data; 164 struct cachefiles_cache *cache = file->private_data;
165 unsigned long long b_released;
166 unsigned f_released;
165 char buffer[256]; 167 char buffer[256];
166 int n; 168 int n;
167 169
@@ -174,6 +176,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
174 cachefiles_has_space(cache, 0, 0); 176 cachefiles_has_space(cache, 0, 0);
175 177
176 /* summarise */ 178 /* summarise */
179 f_released = atomic_xchg(&cache->f_released, 0);
180 b_released = atomic_long_xchg(&cache->b_released, 0);
177 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); 181 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
178 182
179 n = snprintf(buffer, sizeof(buffer), 183 n = snprintf(buffer, sizeof(buffer),
@@ -183,15 +187,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
183 " fstop=%llx" 187 " fstop=%llx"
184 " brun=%llx" 188 " brun=%llx"
185 " bcull=%llx" 189 " bcull=%llx"
186 " bstop=%llx", 190 " bstop=%llx"
191 " freleased=%x"
192 " breleased=%llx",
187 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', 193 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
188 (unsigned long long) cache->frun, 194 (unsigned long long) cache->frun,
189 (unsigned long long) cache->fcull, 195 (unsigned long long) cache->fcull,
190 (unsigned long long) cache->fstop, 196 (unsigned long long) cache->fstop,
191 (unsigned long long) cache->brun, 197 (unsigned long long) cache->brun,
192 (unsigned long long) cache->bcull, 198 (unsigned long long) cache->bcull,
193 (unsigned long long) cache->bstop 199 (unsigned long long) cache->bstop,
194 ); 200 f_released,
201 b_released);
195 202
196 if (n > buflen) 203 if (n > buflen)
197 return -EMSGSIZE; 204 return -EMSGSIZE;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 675a3332d72f..861d611b8c05 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -291,15 +291,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
291 } 291 }
292 292
293 /* note that the object is now inactive */ 293 /* note that the object is now inactive */
294 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { 294 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
295 write_lock(&cache->active_lock); 295 cachefiles_mark_object_inactive(cache, object);
296 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
297 &object->flags))
298 BUG();
299 rb_erase(&object->active_node, &cache->active_nodes);
300 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
301 write_unlock(&cache->active_lock);
302 }
303 296
304 dput(object->dentry); 297 dput(object->dentry);
305 object->dentry = NULL; 298 object->dentry = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9c4b737a54df..2fcde1a34b7c 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -66,6 +66,8 @@ struct cachefiles_cache {
66 struct rb_root active_nodes; /* active nodes (can't be culled) */ 66 struct rb_root active_nodes; /* active nodes (can't be culled) */
67 rwlock_t active_lock; /* lock for active_nodes */ 67 rwlock_t active_lock; /* lock for active_nodes */
68 atomic_t gravecounter; /* graveyard uniquifier */ 68 atomic_t gravecounter; /* graveyard uniquifier */
69 atomic_t f_released; /* number of objects released lately */
70 atomic_long_t b_released; /* number of blocks released lately */
69 unsigned frun_percent; /* when to stop culling (% files) */ 71 unsigned frun_percent; /* when to stop culling (% files) */
70 unsigned fcull_percent; /* when to start culling (% files) */ 72 unsigned fcull_percent; /* when to start culling (% files) */
71 unsigned fstop_percent; /* when to stop allocating (% files) */ 73 unsigned fstop_percent; /* when to stop allocating (% files) */
@@ -157,6 +159,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
157/* 159/*
158 * namei.c 160 * namei.c
159 */ 161 */
162extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
163 struct cachefiles_object *object);
160extern int cachefiles_delete_object(struct cachefiles_cache *cache, 164extern int cachefiles_delete_object(struct cachefiles_cache *cache,
161 struct cachefiles_object *object); 165 struct cachefiles_object *object);
162extern int cachefiles_walk_to_object(struct cachefiles_object *parent, 166extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1c2334c163dd..4ae75006e73b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -258,6 +258,28 @@ requeue:
258} 258}
259 259
260/* 260/*
261 * Mark an object as being inactive.
262 */
263void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
264 struct cachefiles_object *object)
265{
266 write_lock(&cache->active_lock);
267 rb_erase(&object->active_node, &cache->active_nodes);
268 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
269 write_unlock(&cache->active_lock);
270
271 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
272
273 /* This object can now be culled, so we need to let the daemon know
274 * that there is something it can remove if it needs to.
275 */
276 atomic_long_add(d_backing_inode(object->dentry)->i_blocks,
277 &cache->b_released);
278 if (atomic_inc_return(&cache->f_released))
279 cachefiles_state_changed(cache);
280}
281
282/*
261 * delete an object representation from the cache 283 * delete an object representation from the cache
262 * - file backed objects are unlinked 284 * - file backed objects are unlinked
263 * - directory backed objects are stuffed into the graveyard for userspace to 285 * - directory backed objects are stuffed into the graveyard for userspace to
@@ -684,11 +706,7 @@ mark_active_timed_out:
684 706
685check_error: 707check_error:
686 _debug("check error %d", ret); 708 _debug("check error %d", ret);
687 write_lock(&cache->active_lock); 709 cachefiles_mark_object_inactive(cache, object);
688 rb_erase(&object->active_node, &cache->active_nodes);
689 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
690 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
691 write_unlock(&cache->active_lock);
692release_dentry: 710release_dentry:
693 dput(object->dentry); 711 dput(object->dentry);
694 object->dentry = NULL; 712 object->dentry = NULL;
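The cachefiles changes introduce read-and-reset release counters:
cachefiles_mark_object_inactive() bumps f_released and b_released whenever an
object becomes cullable, and the daemon-read path drains both with
atomic_xchg(..., 0), so each read reports only the releases since the previous
one. The generic shape of the pattern, with a hypothetical counter:

    static atomic_t events;

    static void producer(void)
    {
            atomic_inc(&events);            /* cheap bump on the hot path */
    }

    static unsigned int consumer_snapshot(void)
    {
            /* read and reset atomically: each event is reported once */
            return atomic_xchg(&events, 0);
    }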
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 19adeb0ef82a..fc5cae2a0db2 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
175 175
176static int ceph_releasepage(struct page *page, gfp_t g) 176static int ceph_releasepage(struct page *page, gfp_t g)
177{ 177{
178 struct inode *inode = page->mapping ? page->mapping->host : NULL; 178 dout("%p releasepage %p idx %lu\n", page->mapping->host,
179 dout("%p releasepage %p idx %lu\n", inode, page, page->index); 179 page, page->index);
180 WARN_ON(PageDirty(page)); 180 WARN_ON(PageDirty(page));
181 181
182 /* Can we release the page from the cache? */ 182 /* Can we release the page from the cache? */
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
276 for (i = 0; i < num_pages; i++) { 276 for (i = 0; i < num_pages; i++) {
277 struct page *page = osd_data->pages[i]; 277 struct page *page = osd_data->pages[i];
278 278
279 if (rc < 0 && rc != ENOENT) 279 if (rc < 0 && rc != -ENOENT)
280 goto unlock; 280 goto unlock;
281 if (bytes < (int)PAGE_CACHE_SIZE) { 281 if (bytes < (int)PAGE_CACHE_SIZE) {
282 /* zero (remainder of) page */ 282 /* zero (remainder of) page */
@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
606 struct inode *inode = req->r_inode; 606 struct inode *inode = req->r_inode;
607 struct ceph_inode_info *ci = ceph_inode(inode); 607 struct ceph_inode_info *ci = ceph_inode(inode);
608 struct ceph_osd_data *osd_data; 608 struct ceph_osd_data *osd_data;
609 unsigned wrote;
610 struct page *page; 609 struct page *page;
611 int num_pages; 610 int num_pages, total_pages = 0;
612 int i; 611 int i, j;
612 int rc = req->r_result;
613 struct ceph_snap_context *snapc = req->r_snapc; 613 struct ceph_snap_context *snapc = req->r_snapc;
614 struct address_space *mapping = inode->i_mapping; 614 struct address_space *mapping = inode->i_mapping;
615 int rc = req->r_result;
616 u64 bytes = req->r_ops[0].extent.length;
617 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 615 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
618 long writeback_stat; 616 bool remove_page;
619 unsigned issued = ceph_caps_issued(ci);
620 617
621 osd_data = osd_req_op_extent_osd_data(req, 0); 618
622 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 619 dout("writepages_finish %p rc %d\n", inode, rc);
623 num_pages = calc_pages_for((u64)osd_data->alignment, 620 if (rc < 0)
624 (u64)osd_data->length);
625 if (rc >= 0) {
626 /*
627 * Assume we wrote the pages we originally sent. The
628 * osd might reply with fewer pages if our writeback
629 * raced with a truncation and was adjusted at the osd,
630 * so don't believe the reply.
631 */
632 wrote = num_pages;
633 } else {
634 wrote = 0;
635 mapping_set_error(mapping, rc); 621 mapping_set_error(mapping, rc);
636 }
637 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
638 inode, rc, bytes, wrote);
639 622
640 /* clean all pages */ 623 /*
641 for (i = 0; i < num_pages; i++) { 624 * We lost the cache cap, need to truncate the page before
642 page = osd_data->pages[i]; 625 * it is unlocked, otherwise we'd truncate it later in the
643 BUG_ON(!page); 626 * page truncation thread, possibly losing some data that
644 WARN_ON(!PageUptodate(page)); 627 * raced its way in
628 */
629 remove_page = !(ceph_caps_issued(ci) &
630 (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
645 631
646 writeback_stat = 632 /* clean all pages */
647 atomic_long_dec_return(&fsc->writeback_count); 633 for (i = 0; i < req->r_num_ops; i++) {
648 if (writeback_stat < 634 if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
649 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 635 break;
650 clear_bdi_congested(&fsc->backing_dev_info,
651 BLK_RW_ASYNC);
652 636
653 ceph_put_snap_context(page_snap_context(page)); 637 osd_data = osd_req_op_extent_osd_data(req, i);
654 page->private = 0; 638 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
655 ClearPagePrivate(page); 639 num_pages = calc_pages_for((u64)osd_data->alignment,
656 dout("unlocking %d %p\n", i, page); 640 (u64)osd_data->length);
657 end_page_writeback(page); 641 total_pages += num_pages;
642 for (j = 0; j < num_pages; j++) {
643 page = osd_data->pages[j];
644 BUG_ON(!page);
645 WARN_ON(!PageUptodate(page));
646
647 if (atomic_long_dec_return(&fsc->writeback_count) <
648 CONGESTION_OFF_THRESH(
649 fsc->mount_options->congestion_kb))
650 clear_bdi_congested(&fsc->backing_dev_info,
651 BLK_RW_ASYNC);
652
653 ceph_put_snap_context(page_snap_context(page));
654 page->private = 0;
655 ClearPagePrivate(page);
656 dout("unlocking %p\n", page);
657 end_page_writeback(page);
658
659 if (remove_page)
660 generic_error_remove_page(inode->i_mapping,
661 page);
658 662
659 /* 663 unlock_page(page);
660 * We lost the cache cap, need to truncate the page before 664 }
661 * it is unlocked, otherwise we'd truncate it later in the 665 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
662 * page truncation thread, possibly losing some data that 666 inode, osd_data->length, rc >= 0 ? num_pages : 0);
663 * raced its way in
664 */
665 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
666 generic_error_remove_page(inode->i_mapping, page);
667 667
668 unlock_page(page); 668 ceph_release_pages(osd_data->pages, num_pages);
669 } 669 }
670 dout("%p wrote+cleaned %d pages\n", inode, wrote);
671 ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
672 670
673 ceph_release_pages(osd_data->pages, num_pages); 671 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
672
673 osd_data = osd_req_op_extent_osd_data(req, 0);
674 if (osd_data->pages_from_pool) 674 if (osd_data->pages_from_pool)
675 mempool_free(osd_data->pages, 675 mempool_free(osd_data->pages,
676 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); 676 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
@@ -778,17 +778,15 @@ retry:
778 while (!done && index <= end) { 778 while (!done && index <= end) {
779 unsigned i; 779 unsigned i;
780 int first; 780 int first;
781 pgoff_t next; 781 pgoff_t strip_unit_end = 0;
782 int pvec_pages, locked_pages; 782 int num_ops = 0, op_idx;
783 struct page **pages = NULL; 783 int pvec_pages, locked_pages = 0;
784 struct page **pages = NULL, **data_pages;
784 mempool_t *pool = NULL; /* Becomes non-null if mempool used */ 785 mempool_t *pool = NULL; /* Becomes non-null if mempool used */
785 struct page *page; 786 struct page *page;
786 int want; 787 int want;
787 u64 offset, len; 788 u64 offset = 0, len = 0;
788 long writeback_stat;
789 789
790 next = 0;
791 locked_pages = 0;
792 max_pages = max_pages_ever; 790 max_pages = max_pages_ever;
793 791
794get_more_pages: 792get_more_pages:
@@ -824,8 +822,8 @@ get_more_pages:
824 unlock_page(page); 822 unlock_page(page);
825 break; 823 break;
826 } 824 }
827 if (next && (page->index != next)) { 825 if (strip_unit_end && (page->index > strip_unit_end)) {
828 dout("not consecutive %p\n", page); 826 dout("end of strip unit %p\n", page);
829 unlock_page(page); 827 unlock_page(page);
830 break; 828 break;
831 } 829 }
@@ -867,36 +865,31 @@ get_more_pages:
867 /* 865 /*
868 * We have something to write. If this is 866 * We have something to write. If this is
869 * the first locked page this time through, 867 * the first locked page this time through,
870 * allocate an osd request and a page array 868 * calculate max possible write size and
871 * that it will use. 869 * allocate a page array
872 */ 870 */
873 if (locked_pages == 0) { 871 if (locked_pages == 0) {
874 BUG_ON(pages); 872 u64 objnum;
873 u64 objoff;
874
875 /* prepare async write request */ 875 /* prepare async write request */
876 offset = (u64)page_offset(page); 876 offset = (u64)page_offset(page);
877 len = wsize; 877 len = wsize;
878 req = ceph_osdc_new_request(&fsc->client->osdc, 878
879 &ci->i_layout, vino, 879 rc = ceph_calc_file_object_mapping(&ci->i_layout,
880 offset, &len, 0, 880 offset, len,
881 do_sync ? 2 : 1, 881 &objnum, &objoff,
882 CEPH_OSD_OP_WRITE, 882 &len);
883 CEPH_OSD_FLAG_WRITE | 883 if (rc < 0) {
884 CEPH_OSD_FLAG_ONDISK,
885 snapc, truncate_seq,
886 truncate_size, true);
887 if (IS_ERR(req)) {
888 rc = PTR_ERR(req);
889 unlock_page(page); 884 unlock_page(page);
890 break; 885 break;
891 } 886 }
892 887
893 if (do_sync) 888 num_ops = 1 + do_sync;
894 osd_req_op_init(req, 1, 889 strip_unit_end = page->index +
895 CEPH_OSD_OP_STARTSYNC, 0); 890 ((len - 1) >> PAGE_CACHE_SHIFT);
896
897 req->r_callback = writepages_finish;
898 req->r_inode = inode;
899 891
892 BUG_ON(pages);
900 max_pages = calc_pages_for(0, (u64)len); 893 max_pages = calc_pages_for(0, (u64)len);
901 pages = kmalloc(max_pages * sizeof (*pages), 894 pages = kmalloc(max_pages * sizeof (*pages),
902 GFP_NOFS); 895 GFP_NOFS);
@@ -905,6 +898,20 @@ get_more_pages:
905 pages = mempool_alloc(pool, GFP_NOFS); 898 pages = mempool_alloc(pool, GFP_NOFS);
906 BUG_ON(!pages); 899 BUG_ON(!pages);
907 } 900 }
901
902 len = 0;
903 } else if (page->index !=
904 (offset + len) >> PAGE_CACHE_SHIFT) {
905 if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
906 CEPH_OSD_MAX_OPS)) {
907 redirty_page_for_writepage(wbc, page);
908 unlock_page(page);
909 break;
910 }
911
912 num_ops++;
913 offset = (u64)page_offset(page);
914 len = 0;
908 } 915 }
909 916
910 /* note position of first page in pvec */ 917 /* note position of first page in pvec */
@@ -913,18 +920,16 @@ get_more_pages:
913 dout("%p will write page %p idx %lu\n", 920 dout("%p will write page %p idx %lu\n",
914 inode, page, page->index); 921 inode, page, page->index);
915 922
916 writeback_stat = 923 if (atomic_long_inc_return(&fsc->writeback_count) >
917 atomic_long_inc_return(&fsc->writeback_count); 924 CONGESTION_ON_THRESH(
918 if (writeback_stat > CONGESTION_ON_THRESH(
919 fsc->mount_options->congestion_kb)) { 925 fsc->mount_options->congestion_kb)) {
920 set_bdi_congested(&fsc->backing_dev_info, 926 set_bdi_congested(&fsc->backing_dev_info,
921 BLK_RW_ASYNC); 927 BLK_RW_ASYNC);
922 } 928 }
923 929
924 set_page_writeback(page);
925 pages[locked_pages] = page; 930 pages[locked_pages] = page;
926 locked_pages++; 931 locked_pages++;
927 next = page->index + 1; 932 len += PAGE_CACHE_SIZE;
928 } 933 }
929 934
930 /* did we get anything? */ 935 /* did we get anything? */
@@ -944,38 +949,119 @@ get_more_pages:
944 /* shift unused pages over in the pvec... we 949 /* shift unused pages over in the pvec... we
945 * will need to release them below. */ 950 * will need to release them below. */
946 for (j = i; j < pvec_pages; j++) { 951 for (j = i; j < pvec_pages; j++) {
947 dout(" pvec leftover page %p\n", 952 dout(" pvec leftover page %p\n", pvec.pages[j]);
948 pvec.pages[j]);
949 pvec.pages[j-i+first] = pvec.pages[j]; 953 pvec.pages[j-i+first] = pvec.pages[j];
950 } 954 }
951 pvec.nr -= i-first; 955 pvec.nr -= i-first;
952 } 956 }
953 957
954 /* Format the osd request message and submit the write */ 958new_request:
955 offset = page_offset(pages[0]); 959 offset = page_offset(pages[0]);
956 len = (u64)locked_pages << PAGE_CACHE_SHIFT; 960 len = wsize;
957 if (snap_size == -1) { 961
958 len = min(len, (u64)i_size_read(inode) - offset); 962 req = ceph_osdc_new_request(&fsc->client->osdc,
959 /* writepages_finish() clears writeback pages 963 &ci->i_layout, vino,
960 * according to the data length, so make sure 964 offset, &len, 0, num_ops,
961 * data length covers all locked pages */ 965 CEPH_OSD_OP_WRITE,
962 len = max(len, 1 + 966 CEPH_OSD_FLAG_WRITE |
963 ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); 967 CEPH_OSD_FLAG_ONDISK,
964 } else { 968 snapc, truncate_seq,
965 len = min(len, snap_size - offset); 969 truncate_size, false);
970 if (IS_ERR(req)) {
971 req = ceph_osdc_new_request(&fsc->client->osdc,
972 &ci->i_layout, vino,
973 offset, &len, 0,
974 min(num_ops,
975 CEPH_OSD_SLAB_OPS),
976 CEPH_OSD_OP_WRITE,
977 CEPH_OSD_FLAG_WRITE |
978 CEPH_OSD_FLAG_ONDISK,
979 snapc, truncate_seq,
980 truncate_size, true);
981 BUG_ON(IS_ERR(req));
966 } 982 }
967 dout("writepages got %d pages at %llu~%llu\n", 983 BUG_ON(len < page_offset(pages[locked_pages - 1]) +
968 locked_pages, offset, len); 984 PAGE_CACHE_SIZE - offset);
985
986 req->r_callback = writepages_finish;
987 req->r_inode = inode;
969 988
970 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 989 /* Format the osd request message and submit the write */
990 len = 0;
991 data_pages = pages;
992 op_idx = 0;
993 for (i = 0; i < locked_pages; i++) {
994 u64 cur_offset = page_offset(pages[i]);
995 if (offset + len != cur_offset) {
996 if (op_idx + do_sync + 1 == req->r_num_ops)
997 break;
998 osd_req_op_extent_dup_last(req, op_idx,
999 cur_offset - offset);
1000 dout("writepages got pages at %llu~%llu\n",
1001 offset, len);
1002 osd_req_op_extent_osd_data_pages(req, op_idx,
1003 data_pages, len, 0,
971 !!pool, false); 1004 !!pool, false);
1005 osd_req_op_extent_update(req, op_idx, len);
972 1006
973 pages = NULL; /* request message now owns the pages array */ 1007 len = 0;
974 pool = NULL; 1008 offset = cur_offset;
1009 data_pages = pages + i;
1010 op_idx++;
1011 }
975 1012
976 /* Update the write op length in case we changed it */ 1013 set_page_writeback(pages[i]);
1014 len += PAGE_CACHE_SIZE;
1015 }
1016
1017 if (snap_size != -1) {
1018 len = min(len, snap_size - offset);
1019 } else if (i == locked_pages) {
1020 /* writepages_finish() clears writeback pages
1021 * according to the data length, so make sure
1022 * data length covers all locked pages */
1023 u64 min_len = len + 1 - PAGE_CACHE_SIZE;
1024 len = min(len, (u64)i_size_read(inode) - offset);
1025 len = max(len, min_len);
1026 }
1027 dout("writepages got pages at %llu~%llu\n", offset, len);
1028
1029 osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
1030 0, !!pool, false);
1031 osd_req_op_extent_update(req, op_idx, len);
977 1032
978 osd_req_op_extent_update(req, 0, len); 1033 if (do_sync) {
1034 op_idx++;
1035 osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
1036 }
1037 BUG_ON(op_idx + 1 != req->r_num_ops);
1038
1039 pool = NULL;
1040 if (i < locked_pages) {
1041 BUG_ON(num_ops <= req->r_num_ops);
1042 num_ops -= req->r_num_ops;
1043 num_ops += do_sync;
1044 locked_pages -= i;
1045
1046 /* allocate new pages array for next request */
1047 data_pages = pages;
1048 pages = kmalloc(locked_pages * sizeof (*pages),
1049 GFP_NOFS);
1050 if (!pages) {
1051 pool = fsc->wb_pagevec_pool;
1052 pages = mempool_alloc(pool, GFP_NOFS);
1053 BUG_ON(!pages);
1054 }
1055 memcpy(pages, data_pages + i,
1056 locked_pages * sizeof(*pages));
1057 memset(data_pages + i, 0,
1058 locked_pages * sizeof(*pages));
1059 } else {
1060 BUG_ON(num_ops != req->r_num_ops);
1061 index = pages[i - 1]->index + 1;
1062 /* request message now owns the pages array */
1063 pages = NULL;
1064 }
979 1065
980 vino = ceph_vino(inode); 1066 vino = ceph_vino(inode);
981 ceph_osdc_build_request(req, offset, snapc, vino.snap, 1067 ceph_osdc_build_request(req, offset, snapc, vino.snap,
@@ -985,9 +1071,10 @@ get_more_pages:
985 BUG_ON(rc); 1071 BUG_ON(rc);
986 req = NULL; 1072 req = NULL;
987 1073
988 /* continue? */ 1074 wbc->nr_to_write -= i;
989 index = next; 1075 if (pages)
990 wbc->nr_to_write -= locked_pages; 1076 goto new_request;
1077
991 if (wbc->nr_to_write <= 0) 1078 if (wbc->nr_to_write <= 0)
992 done = 1; 1079 done = 1;
993 1080
@@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
1522 ceph_vino(inode), 0, &len, 0, 1, 1609 ceph_vino(inode), 0, &len, 0, 1,
1523 CEPH_OSD_OP_CREATE, 1610 CEPH_OSD_OP_CREATE,
1524 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1611 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1525 ceph_empty_snapc, 0, 0, false); 1612 NULL, 0, 0, false);
1526 if (IS_ERR(req)) { 1613 if (IS_ERR(req)) {
1527 err = PTR_ERR(req); 1614 err = PTR_ERR(req);
1528 goto out; 1615 goto out;
@@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
1540 ceph_vino(inode), 0, &len, 1, 3, 1627 ceph_vino(inode), 0, &len, 1, 3,
1541 CEPH_OSD_OP_WRITE, 1628 CEPH_OSD_OP_WRITE,
1542 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1629 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1543 ceph_empty_snapc, 1630 NULL, ci->i_truncate_seq,
1544 ci->i_truncate_seq, ci->i_truncate_size, 1631 ci->i_truncate_size, false);
1545 false);
1546 if (IS_ERR(req)) { 1632 if (IS_ERR(req)) {
1547 err = PTR_ERR(req); 1633 err = PTR_ERR(req);
1548 goto out; 1634 goto out;
@@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
1663 goto out; 1749 goto out;
1664 } 1750 }
1665 1751
1666 rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1752 rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1667 ceph_empty_snapc,
1668 1, false, GFP_NOFS); 1753 1, false, GFP_NOFS);
1669 if (!rd_req) { 1754 if (!rd_req) {
1670 err = -ENOMEM; 1755 err = -ENOMEM;
@@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
1678 "%llx.00000000", ci->i_vino.ino); 1763 "%llx.00000000", ci->i_vino.ino);
1679 rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); 1764 rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
1680 1765
1681 wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1766 wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1682 ceph_empty_snapc,
1683 1, false, GFP_NOFS); 1767 1, false, GFP_NOFS);
1684 if (!wr_req) { 1768 if (!wr_req) {
1685 err = -ENOMEM; 1769 err = -ENOMEM;
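The ceph_writepages_start() rework above no longer requires a strictly
consecutive page run per OSD request: it gathers dirty pages up to the end of
a strip unit, then splits the locked run into one CEPH_OSD_OP_WRITE extent op
per contiguous range, bounded by the per-request op limit. Schematically,
with emit_write_op() as a hypothetical stand-in for the
osd_req_op_extent_osd_data_pages()/osd_req_op_extent_update() pair:

    u64 offset = page_offset(pages[0]);
    u64 len = 0;
    int i, op = 0;

    for (i = 0; i < locked_pages; i++) {
            u64 cur = page_offset(pages[i]);

            if (offset + len != cur) {
                    /* gap before this page: close the current extent op */
                    emit_write_op(req, op++, offset, len);
                    offset = cur;
                    len = 0;
            }
            len += PAGE_CACHE_SIZE;
    }
    emit_write_op(req, op, offset, len);    /* final extent */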
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6fe0ad26a7df..de17bb232ff8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
991 u32 seq, u64 flush_tid, u64 oldest_flush_tid, 991 u32 seq, u64 flush_tid, u64 oldest_flush_tid,
992 u32 issue_seq, u32 mseq, u64 size, u64 max_size, 992 u32 issue_seq, u32 mseq, u64 size, u64 max_size,
993 struct timespec *mtime, struct timespec *atime, 993 struct timespec *mtime, struct timespec *atime,
994 u64 time_warp_seq, 994 struct timespec *ctime, u64 time_warp_seq,
995 kuid_t uid, kgid_t gid, umode_t mode, 995 kuid_t uid, kgid_t gid, umode_t mode,
996 u64 xattr_version, 996 u64 xattr_version,
997 struct ceph_buffer *xattrs_buf, 997 struct ceph_buffer *xattrs_buf,
@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
1042 ceph_encode_timespec(&fc->mtime, mtime); 1042 ceph_encode_timespec(&fc->mtime, mtime);
1043 if (atime) 1043 if (atime)
1044 ceph_encode_timespec(&fc->atime, atime); 1044 ceph_encode_timespec(&fc->atime, atime);
1045 if (ctime)
1046 ceph_encode_timespec(&fc->ctime, ctime);
1045 fc->time_warp_seq = cpu_to_le32(time_warp_seq); 1047 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
1046 1048
1047 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); 1049 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1116 int held, revoking, dropping, keep; 1118 int held, revoking, dropping, keep;
1117 u64 seq, issue_seq, mseq, time_warp_seq, follows; 1119 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1118 u64 size, max_size; 1120 u64 size, max_size;
1119 struct timespec mtime, atime; 1121 struct timespec mtime, atime, ctime;
1120 int wake = 0; 1122 int wake = 0;
1121 umode_t mode; 1123 umode_t mode;
1122 kuid_t uid; 1124 kuid_t uid;
@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1180 ci->i_requested_max_size = max_size; 1182 ci->i_requested_max_size = max_size;
1181 mtime = inode->i_mtime; 1183 mtime = inode->i_mtime;
1182 atime = inode->i_atime; 1184 atime = inode->i_atime;
1185 ctime = inode->i_ctime;
1183 time_warp_seq = ci->i_time_warp_seq; 1186 time_warp_seq = ci->i_time_warp_seq;
1184 uid = inode->i_uid; 1187 uid = inode->i_uid;
1185 gid = inode->i_gid; 1188 gid = inode->i_gid;
@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1198 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1201 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1199 op, keep, want, flushing, seq, 1202 op, keep, want, flushing, seq,
1200 flush_tid, oldest_flush_tid, issue_seq, mseq, 1203 flush_tid, oldest_flush_tid, issue_seq, mseq,
1201 size, max_size, &mtime, &atime, time_warp_seq, 1204 size, max_size, &mtime, &atime, &ctime, time_warp_seq,
1202 uid, gid, mode, xattr_version, xattr_blob, 1205 uid, gid, mode, xattr_version, xattr_blob,
1203 follows, inline_data); 1206 follows, inline_data);
1204 if (ret < 0) { 1207 if (ret < 0) {
@@ -1320,7 +1323,7 @@ retry:
1320 capsnap->dirty, 0, capsnap->flush_tid, 0, 1323 capsnap->dirty, 0, capsnap->flush_tid, 0,
1321 0, mseq, capsnap->size, 0, 1324 0, mseq, capsnap->size, 0,
1322 &capsnap->mtime, &capsnap->atime, 1325 &capsnap->mtime, &capsnap->atime,
1323 capsnap->time_warp_seq, 1326 &capsnap->ctime, capsnap->time_warp_seq,
1324 capsnap->uid, capsnap->gid, capsnap->mode, 1327 capsnap->uid, capsnap->gid, capsnap->mode,
1325 capsnap->xattr_version, capsnap->xattr_blob, 1328 capsnap->xattr_version, capsnap->xattr_blob,
1326 capsnap->follows, capsnap->inline_data); 1329 capsnap->follows, capsnap->inline_data);
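The caps hunks thread the inode's ctime through send_cap_msg() so that cap messages carry all three timestamps. On the wire each timestamp is a little-endian 32-bit sec/nsec pair; the encoder the hunk calls looks roughly like this, per the era's include/linux/ceph/decode.h (treat this as a sketch, not a quotation):

static inline void ceph_encode_timespec(struct ceph_timespec *tv,
                                        const struct timespec *ts)
{
        /* ceph wire format: 32-bit little-endian seconds/nanoseconds */
        tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
        tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
}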
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fd11fb231a2e..fadc243dfb28 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
38 if (dentry->d_fsdata) 38 if (dentry->d_fsdata)
39 return 0; 39 return 0;
40 40
41 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); 41 di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
42 if (!di) 42 if (!di)
43 return -ENOMEM; /* oh well */ 43 return -ENOMEM; /* oh well */
44 44
@@ -68,23 +68,6 @@ out_unlock:
68 return 0; 68 return 0;
69} 69}
70 70
71struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
72{
73 struct inode *inode = NULL;
74
75 if (!dentry)
76 return NULL;
77
78 spin_lock(&dentry->d_lock);
79 if (!IS_ROOT(dentry)) {
80 inode = d_inode(dentry->d_parent);
81 ihold(inode);
82 }
83 spin_unlock(&dentry->d_lock);
84 return inode;
85}
86
87
88/* 71/*
89 * for readdir, we encode the directory frag and offset within that 72 * for readdir, we encode the directory frag and offset within that
90 * frag into f_pos. 73 * frag into f_pos.
@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
624 struct ceph_mds_client *mdsc = fsc->mdsc; 607 struct ceph_mds_client *mdsc = fsc->mdsc;
625 struct ceph_mds_request *req; 608 struct ceph_mds_request *req;
626 int op; 609 int op;
610 int mask;
627 int err; 611 int err;
628 612
629 dout("lookup %p dentry %p '%pd'\n", 613 dout("lookup %p dentry %p '%pd'\n",
@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
666 return ERR_CAST(req); 650 return ERR_CAST(req);
667 req->r_dentry = dget(dentry); 651 req->r_dentry = dget(dentry);
668 req->r_num_caps = 2; 652 req->r_num_caps = 2;
669 /* we only need inode linkage */ 653
670 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 654 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
655 if (ceph_security_xattr_wanted(dir))
656 mask |= CEPH_CAP_XATTR_SHARED;
657 req->r_args.getattr.mask = cpu_to_le32(mask);
658
671 req->r_locked_dir = dir; 659 req->r_locked_dir = dir;
672 err = ceph_mdsc_do_request(mdsc, NULL, req); 660 err = ceph_mdsc_do_request(mdsc, NULL, req);
673 err = ceph_handle_snapdir(req, dentry, err); 661 err = ceph_handle_snapdir(req, dentry, err);
@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
1095static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) 1083static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1096{ 1084{
1097 int valid = 0; 1085 int valid = 0;
1086 struct dentry *parent;
1098 struct inode *dir; 1087 struct inode *dir;
1099 1088
1100 if (flags & LOOKUP_RCU) 1089 if (flags & LOOKUP_RCU)
@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1103 dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, 1092 dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
1104 dentry, d_inode(dentry), ceph_dentry(dentry)->offset); 1093 dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
1105 1094
1106 dir = ceph_get_dentry_parent_inode(dentry); 1095 parent = dget_parent(dentry);
1096 dir = d_inode(parent);
1107 1097
1108 /* always trust cached snapped dentries, snapdir dentry */ 1098 /* always trust cached snapped dentries, snapdir dentry */
1109 if (ceph_snap(dir) != CEPH_NOSNAP) { 1099 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1121 valid = 1; 1111 valid = 1;
1122 } 1112 }
1123 1113
1114 if (!valid) {
1115 struct ceph_mds_client *mdsc =
1116 ceph_sb_to_client(dir->i_sb)->mdsc;
1117 struct ceph_mds_request *req;
1118 int op, mask, err;
1119
1120 op = ceph_snap(dir) == CEPH_SNAPDIR ?
1121 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
1122 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
1123 if (!IS_ERR(req)) {
1124 req->r_dentry = dget(dentry);
1125 req->r_num_caps = 2;
1126
1127 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
1128 if (ceph_security_xattr_wanted(dir))
1129 mask |= CEPH_CAP_XATTR_SHARED;
1130 req->r_args.getattr.mask = cpu_to_le32(mask);
1131
1132 req->r_locked_dir = dir;
1133 err = ceph_mdsc_do_request(mdsc, NULL, req);
1134 if (err == 0 || err == -ENOENT) {
1135 if (dentry == req->r_dentry) {
1136 valid = !d_unhashed(dentry);
1137 } else {
1138 d_invalidate(req->r_dentry);
1139 err = -EAGAIN;
1140 }
1141 }
1142 ceph_mdsc_put_request(req);
1143 dout("d_revalidate %p lookup result=%d\n",
1144 dentry, err);
1145 }
1146 }
1147
1124 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1148 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1125 if (valid) { 1149 if (valid) {
1126 ceph_dentry_lru_touch(dentry); 1150 ceph_dentry_lru_touch(dentry);
1127 } else { 1151 } else {
1128 ceph_dir_clear_complete(dir); 1152 ceph_dir_clear_complete(dir);
1129 } 1153 }
1130 iput(dir); 1154
1155 dput(parent);
1131 return valid; 1156 return valid;
1132} 1157}
1133 1158
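Two things change in the d_revalidate hunks: the open-coded parent helper gives way to the generic dget_parent(), and a failed lease check now falls back to a synchronous LOOKUP/LOOKUPSNAP request instead of simply declaring the dentry invalid. A minimal sketch of the parent-pinning idiom, using only the VFS calls visible above:

        /* dget_parent() takes a reference on the current parent dentry,
         * keeping its inode stable while we validate against it; dput()
         * replaces the iput() that the removed helper required. */
        struct dentry *parent = dget_parent(dentry);
        struct inode *dir = d_inode(parent);

        /* ... check leases, or issue a lookup request against dir ... */

        dput(parent);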
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 3b3172357326..6e72c98162d5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
71 inode = ceph_find_inode(sb, vino); 71 inode = ceph_find_inode(sb, vino);
72 if (!inode) { 72 if (!inode) {
73 struct ceph_mds_request *req; 73 struct ceph_mds_request *req;
74 int mask;
74 75
75 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, 76 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
76 USE_ANY_MDS); 77 USE_ANY_MDS);
77 if (IS_ERR(req)) 78 if (IS_ERR(req))
78 return ERR_CAST(req); 79 return ERR_CAST(req);
79 80
81 mask = CEPH_STAT_CAP_INODE;
82 if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
83 mask |= CEPH_CAP_XATTR_SHARED;
84 req->r_args.getattr.mask = cpu_to_le32(mask);
85
80 req->r_ino1 = vino; 86 req->r_ino1 = vino;
81 req->r_num_caps = 1; 87 req->r_num_caps = 1;
82 err = ceph_mdsc_do_request(mdsc, NULL, req); 88 err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb,
128 struct ceph_mds_request *req; 134 struct ceph_mds_request *req;
129 struct inode *inode; 135 struct inode *inode;
130 struct dentry *dentry; 136 struct dentry *dentry;
137 int mask;
131 int err; 138 int err;
132 139
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, 140 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb,
144 .snap = CEPH_NOSNAP, 151 .snap = CEPH_NOSNAP,
145 }; 152 };
146 } 153 }
154
155 mask = CEPH_STAT_CAP_INODE;
156 if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
157 mask |= CEPH_CAP_XATTR_SHARED;
158 req->r_args.getattr.mask = cpu_to_le32(mask);
159
147 req->r_num_caps = 1; 160 req->r_num_caps = 1;
148 err = ceph_mdsc_do_request(mdsc, NULL, req); 161 err = ceph_mdsc_do_request(mdsc, NULL, req);
149 inode = req->r_target_inode; 162 inode = req->r_target_inode;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index eb9028e8cfc5..ef38f01c1795 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -157,7 +157,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
157 case S_IFDIR: 157 case S_IFDIR:
158 dout("init_file %p %p 0%o (regular)\n", inode, file, 158 dout("init_file %p %p 0%o (regular)\n", inode, file,
159 inode->i_mode); 159 inode->i_mode);
160 cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); 160 cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
161 if (cf == NULL) { 161 if (cf == NULL) {
162 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 162 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
163 return -ENOMEM; 163 return -ENOMEM;
@@ -300,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
300 struct ceph_mds_request *req; 300 struct ceph_mds_request *req;
301 struct dentry *dn; 301 struct dentry *dn;
302 struct ceph_acls_info acls = {}; 302 struct ceph_acls_info acls = {};
303 int mask;
303 int err; 304 int err;
304 305
305 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", 306 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -335,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
335 acls.pagelist = NULL; 336 acls.pagelist = NULL;
336 } 337 }
337 } 338 }
339
340 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
341 if (ceph_security_xattr_wanted(dir))
342 mask |= CEPH_CAP_XATTR_SHARED;
343 req->r_args.open.mask = cpu_to_le32(mask);
344
338 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 345 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
339 err = ceph_mdsc_do_request(mdsc, 346 err = ceph_mdsc_do_request(mdsc,
340 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 347 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -725,7 +732,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
725 ret = ceph_osdc_start_request(req->r_osdc, req, false); 732 ret = ceph_osdc_start_request(req->r_osdc, req, false);
726out: 733out:
727 if (ret < 0) { 734 if (ret < 0) {
728 BUG_ON(ret == -EOLDSNAPC);
729 req->r_result = ret; 735 req->r_result = ret;
730 ceph_aio_complete_req(req, NULL); 736 ceph_aio_complete_req(req, NULL);
731 } 737 }
@@ -783,7 +789,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
783 int num_pages = 0; 789 int num_pages = 0;
784 int flags; 790 int flags;
785 int ret; 791 int ret;
786 struct timespec mtime = CURRENT_TIME; 792 struct timespec mtime = current_fs_time(inode->i_sb);
787 size_t count = iov_iter_count(iter); 793 size_t count = iov_iter_count(iter);
788 loff_t pos = iocb->ki_pos; 794 loff_t pos = iocb->ki_pos;
789 bool write = iov_iter_rw(iter) == WRITE; 795 bool write = iov_iter_rw(iter) == WRITE;
@@ -949,7 +955,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
949 ret = ceph_osdc_start_request(req->r_osdc, 955 ret = ceph_osdc_start_request(req->r_osdc,
950 req, false); 956 req, false);
951 if (ret < 0) { 957 if (ret < 0) {
952 BUG_ON(ret == -EOLDSNAPC);
953 req->r_result = ret; 958 req->r_result = ret;
954 ceph_aio_complete_req(req, NULL); 959 ceph_aio_complete_req(req, NULL);
955 } 960 }
@@ -988,7 +993,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
988 int flags; 993 int flags;
989 int check_caps = 0; 994 int check_caps = 0;
990 int ret; 995 int ret;
991 struct timespec mtime = CURRENT_TIME; 996 struct timespec mtime = current_fs_time(inode->i_sb);
992 size_t count = iov_iter_count(from); 997 size_t count = iov_iter_count(from);
993 998
994 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 999 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
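Besides the kmem_cache_zalloc() cleanup, the write paths replace the CURRENT_TIME macro with current_fs_time(), which honors the superblock's timestamp granularity. A rough equivalent of that helper, assuming the era's definition in fs/inode.c:

        /* current_fs_time(sb): wall-clock time truncated to the
         * filesystem's advertised granularity (sb->s_time_gran, in ns). */
        struct timespec now = current_kernel_time();
        struct timespec mtime = timespec_trunc(now, inode->i_sb->s_time_gran);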
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5849b88bbed3..ed58b168904a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -549,6 +549,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
549 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 549 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
550 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { 550 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
551 dout("size %lld -> %llu\n", inode->i_size, size); 551 dout("size %lld -> %llu\n", inode->i_size, size);
552 if (size > 0 && S_ISDIR(inode->i_mode)) {
553 pr_err("fill_file_size non-zero size for directory\n");
554 size = 0;
555 }
552 i_size_write(inode, size); 556 i_size_write(inode, size);
553 inode->i_blocks = (size + (1<<9) - 1) >> 9; 557 inode->i_blocks = (size + (1<<9) - 1) >> 9;
554 ci->i_reported_size = size; 558 ci->i_reported_size = size;
@@ -977,13 +981,8 @@ out_unlock:
977/* 981/*
978 * splice a dentry to an inode. 982 * splice a dentry to an inode.
979 * caller must hold directory i_mutex for this to be safe. 983 * caller must hold directory i_mutex for this to be safe.
980 *
981 * we will only rehash the resulting dentry if @prehash is
982 * true; @prehash will be set to false (for the benefit of
983 * the caller) if we fail.
984 */ 984 */
985static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 985static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
986 bool *prehash)
987{ 986{
988 struct dentry *realdn; 987 struct dentry *realdn;
989 988
@@ -996,8 +995,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
996 if (IS_ERR(realdn)) { 995 if (IS_ERR(realdn)) {
997 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", 996 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
998 PTR_ERR(realdn), dn, in, ceph_vinop(in)); 997 PTR_ERR(realdn), dn, in, ceph_vinop(in));
999 if (prehash)
1000 *prehash = false; /* don't rehash on error */
1001 dn = realdn; /* note realdn contains the error */ 998 dn = realdn; /* note realdn contains the error */
1002 goto out; 999 goto out;
1003 } else if (realdn) { 1000 } else if (realdn) {
@@ -1013,8 +1010,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
1013 dout("dn %p attached to %p ino %llx.%llx\n", 1010 dout("dn %p attached to %p ino %llx.%llx\n",
1014 dn, d_inode(dn), ceph_vinop(d_inode(dn))); 1011 dn, d_inode(dn), ceph_vinop(d_inode(dn)));
1015 } 1012 }
1016 if ((!prehash || *prehash) && d_unhashed(dn))
1017 d_rehash(dn);
1018out: 1013out:
1019 return dn; 1014 return dn;
1020} 1015}
@@ -1247,10 +1242,8 @@ retry_lookup:
1247 dout("d_delete %p\n", dn); 1242 dout("d_delete %p\n", dn);
1248 d_delete(dn); 1243 d_delete(dn);
1249 } else { 1244 } else {
1250 dout("d_instantiate %p NULL\n", dn);
1251 d_instantiate(dn, NULL);
1252 if (have_lease && d_unhashed(dn)) 1245 if (have_lease && d_unhashed(dn))
1253 d_rehash(dn); 1246 d_add(dn, NULL);
1254 update_dentry_lease(dn, rinfo->dlease, 1247 update_dentry_lease(dn, rinfo->dlease,
1255 session, 1248 session,
1256 req->r_request_started); 1249 req->r_request_started);
@@ -1262,7 +1255,7 @@ retry_lookup:
1262 if (d_really_is_negative(dn)) { 1255 if (d_really_is_negative(dn)) {
1263 ceph_dir_clear_ordered(dir); 1256 ceph_dir_clear_ordered(dir);
1264 ihold(in); 1257 ihold(in);
1265 dn = splice_dentry(dn, in, &have_lease); 1258 dn = splice_dentry(dn, in);
1266 if (IS_ERR(dn)) { 1259 if (IS_ERR(dn)) {
1267 err = PTR_ERR(dn); 1260 err = PTR_ERR(dn);
1268 goto done; 1261 goto done;
@@ -1272,6 +1265,7 @@ retry_lookup:
1272 dout(" %p links to %p %llx.%llx, not %llx.%llx\n", 1265 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1273 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1266 dn, d_inode(dn), ceph_vinop(d_inode(dn)),
1274 ceph_vinop(in)); 1267 ceph_vinop(in));
1268 d_invalidate(dn);
1275 have_lease = false; 1269 have_lease = false;
1276 } 1270 }
1277 1271
@@ -1292,7 +1286,7 @@ retry_lookup:
1292 dout(" linking snapped dir %p to dn %p\n", in, dn); 1286 dout(" linking snapped dir %p to dn %p\n", in, dn);
1293 ceph_dir_clear_ordered(dir); 1287 ceph_dir_clear_ordered(dir);
1294 ihold(in); 1288 ihold(in);
1295 dn = splice_dentry(dn, in, NULL); 1289 dn = splice_dentry(dn, in);
1296 if (IS_ERR(dn)) { 1290 if (IS_ERR(dn)) {
1297 err = PTR_ERR(dn); 1291 err = PTR_ERR(dn);
1298 goto done; 1292 goto done;
@@ -1360,15 +1354,20 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
1360 1354
1361 if (!ctl->page || pgoff != page_index(ctl->page)) { 1355 if (!ctl->page || pgoff != page_index(ctl->page)) {
1362 ceph_readdir_cache_release(ctl); 1356 ceph_readdir_cache_release(ctl);
1363 ctl->page = grab_cache_page(&dir->i_data, pgoff); 1357 if (idx == 0)
1358 ctl->page = grab_cache_page(&dir->i_data, pgoff);
1359 else
1360 ctl->page = find_lock_page(&dir->i_data, pgoff);
1364 if (!ctl->page) { 1361 if (!ctl->page) {
1365 ctl->index = -1; 1362 ctl->index = -1;
1366 return -ENOMEM; 1363 return idx == 0 ? -ENOMEM : 0;
1367 } 1364 }
1368 /* reading/filling the cache are serialized by 1365 /* reading/filling the cache are serialized by
1369 * i_mutex, no need to use page lock */ 1366 * i_mutex, no need to use page lock */
1370 unlock_page(ctl->page); 1367 unlock_page(ctl->page);
1371 ctl->dentries = kmap(ctl->page); 1368 ctl->dentries = kmap(ctl->page);
1369 if (idx == 0)
1370 memset(ctl->dentries, 0, PAGE_CACHE_SIZE);
1372 } 1371 }
1373 1372
1374 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && 1373 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
@@ -1391,7 +1390,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1391 struct qstr dname; 1390 struct qstr dname;
1392 struct dentry *dn; 1391 struct dentry *dn;
1393 struct inode *in; 1392 struct inode *in;
1394 int err = 0, ret, i; 1393 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1394 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1395 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di; 1396 struct ceph_dentry_info *di;
@@ -1503,7 +1502,17 @@ retry_lookup:
1503 } 1502 }
1504 1503
1505 if (d_really_is_negative(dn)) { 1504 if (d_really_is_negative(dn)) {
1506 struct dentry *realdn = splice_dentry(dn, in, NULL); 1505 struct dentry *realdn;
1506
1507 if (ceph_security_xattr_deadlock(in)) {
1508 dout(" skip splicing dn %p to inode %p"
1509 " (security xattr deadlock)\n", dn, in);
1510 iput(in);
1511 skipped++;
1512 goto next_item;
1513 }
1514
1515 realdn = splice_dentry(dn, in);
1507 if (IS_ERR(realdn)) { 1516 if (IS_ERR(realdn)) {
1508 err = PTR_ERR(realdn); 1517 err = PTR_ERR(realdn);
1509 d_drop(dn); 1518 d_drop(dn);
@@ -1520,7 +1529,7 @@ retry_lookup:
1520 req->r_session, 1529 req->r_session,
1521 req->r_request_started); 1530 req->r_request_started);
1522 1531
1523 if (err == 0 && cache_ctl.index >= 0) { 1532 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
1524 ret = fill_readdir_cache(d_inode(parent), dn, 1533 ret = fill_readdir_cache(d_inode(parent), dn,
1525 &cache_ctl, req); 1534 &cache_ctl, req);
1526 if (ret < 0) 1535 if (ret < 0)
@@ -1531,7 +1540,7 @@ next_item:
1531 dput(dn); 1540 dput(dn);
1532 } 1541 }
1533out: 1542out:
1534 if (err == 0) { 1543 if (err == 0 && skipped == 0) {
1535 req->r_did_prepopulate = true; 1544 req->r_did_prepopulate = true;
1536 req->r_readdir_cache_idx = cache_ctl.index; 1545 req->r_readdir_cache_idx = cache_ctl.index;
1537 } 1546 }
@@ -1961,7 +1970,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1961 if (dirtied) { 1970 if (dirtied) {
1962 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, 1971 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
1963 &prealloc_cf); 1972 &prealloc_cf);
1964 inode->i_ctime = CURRENT_TIME; 1973 inode->i_ctime = current_fs_time(inode->i_sb);
1965 } 1974 }
1966 1975
1967 release &= issued; 1976 release &= issued;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 911d64d865f1..44852c3ae531 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1729,7 +1729,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1729 init_completion(&req->r_safe_completion); 1729 init_completion(&req->r_safe_completion);
1730 INIT_LIST_HEAD(&req->r_unsafe_item); 1730 INIT_LIST_HEAD(&req->r_unsafe_item);
1731 1731
1732 req->r_stamp = CURRENT_TIME; 1732 req->r_stamp = current_fs_time(mdsc->fsc->sb);
1733 1733
1734 req->r_op = op; 1734 req->r_op = op;
1735 req->r_direct_mode = mode; 1735 req->r_direct_mode = mode;
@@ -2540,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2540 2540
2541 /* insert trace into our cache */ 2541 /* insert trace into our cache */
2542 mutex_lock(&req->r_fill_mutex); 2542 mutex_lock(&req->r_fill_mutex);
2543 current->journal_info = req;
2543 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2544 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2544 if (err == 0) { 2545 if (err == 0) {
2545 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || 2546 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
@@ -2547,6 +2548,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2547 ceph_readdir_prepopulate(req, req->r_session); 2548 ceph_readdir_prepopulate(req, req->r_session);
2548 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2549 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2549 } 2550 }
2551 current->journal_info = NULL;
2550 mutex_unlock(&req->r_fill_mutex); 2552 mutex_unlock(&req->r_fill_mutex);
2551 2553
2552 up_read(&mdsc->snap_rwsem); 2554 up_read(&mdsc->snap_rwsem);
@@ -3764,7 +3766,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3764 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3766 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3765 3767
3766 /* do we need it? */ 3768 /* do we need it? */
3767 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3768 mutex_lock(&mdsc->mutex); 3769 mutex_lock(&mdsc->mutex);
3769 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3770 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3770 dout("handle_map epoch %u <= our %u\n", 3771 dout("handle_map epoch %u <= our %u\n",
@@ -3791,6 +3792,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3791 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3792 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3792 3793
3793 __wake_requests(mdsc, &mdsc->waiting_for_map); 3794 __wake_requests(mdsc, &mdsc->waiting_for_map);
3795 ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
3796 mdsc->mdsmap->m_epoch);
3794 3797
3795 mutex_unlock(&mdsc->mutex); 3798 mutex_unlock(&mdsc->mutex);
3796 schedule_delayed(mdsc); 3799 schedule_delayed(mdsc);
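handle_reply() now brackets trace processing with assignments to current->journal_info, a field ceph does not otherwise use, borrowing it as a per-task marker: code running underneath ceph_fill_trace(), notably security-module xattr hooks, can detect the in-trace context and consult the request instead of issuing a nested MDS call. A hypothetical checker in that style (the helper name is illustrative; the real consumers are in the xattr.c hunks below):

        /* Hypothetical sketch: are we filling this inode's trace right now? */
        static bool ceph_in_trace_for(struct inode *in)
        {
                struct ceph_mds_request *req = current->journal_info;

                return req && req->r_target_inode == in;
        }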
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4aa7122a8d38..9caaa7ffc93f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,8 +296,6 @@ static int cmpu64_rev(const void *a, const void *b)
296} 296}
297 297
298 298
299struct ceph_snap_context *ceph_empty_snapc;
300
301/* 299/*
302 * build the snap context for a given realm. 300 * build the snap context for a given realm.
303 */ 301 */
@@ -987,17 +985,3 @@ out:
987 up_write(&mdsc->snap_rwsem); 985 up_write(&mdsc->snap_rwsem);
988 return; 986 return;
989} 987}
990
991int __init ceph_snap_init(void)
992{
993 ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
994 if (!ceph_empty_snapc)
995 return -ENOMEM;
996 ceph_empty_snapc->seq = 1;
997 return 0;
998}
999
1000void ceph_snap_exit(void)
1001{
1002 ceph_put_snap_context(ceph_empty_snapc);
1003}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ca4d5e8457f1..c973043deb0e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -439,8 +439,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
439 439
440 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) 440 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
441 seq_puts(m, ",dirstat"); 441 seq_puts(m, ",dirstat");
442 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) 442 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
443 seq_puts(m, ",norbytes"); 443 seq_puts(m, ",rbytes");
444 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 444 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
445 seq_puts(m, ",noasyncreaddir"); 445 seq_puts(m, ",noasyncreaddir");
446 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) 446 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 goto fail; 530 goto fail;
531 } 531 }
532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 532 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
533 fsc->client->monc.want_mdsmap = 1; 533 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
534 534
535 fsc->mount_options = fsopt; 535 fsc->mount_options = fsopt;
536 536
@@ -793,22 +793,20 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
793 struct dentry *root; 793 struct dentry *root;
794 int first = 0; /* first vfsmount for this super_block */ 794 int first = 0; /* first vfsmount for this super_block */
795 795
796 dout("mount start\n"); 796 dout("mount start %p\n", fsc);
797 mutex_lock(&fsc->client->mount_mutex); 797 mutex_lock(&fsc->client->mount_mutex);
798 798
799 err = __ceph_open_session(fsc->client, started); 799 if (!fsc->sb->s_root) {
800 if (err < 0) 800 err = __ceph_open_session(fsc->client, started);
801 goto out; 801 if (err < 0)
802 goto out;
802 803
803 dout("mount opening root\n"); 804 dout("mount opening root\n");
804 root = open_root_dentry(fsc, "", started); 805 root = open_root_dentry(fsc, "", started);
805 if (IS_ERR(root)) { 806 if (IS_ERR(root)) {
806 err = PTR_ERR(root); 807 err = PTR_ERR(root);
807 goto out; 808 goto out;
808 } 809 }
809 if (fsc->sb->s_root) {
810 dput(root);
811 } else {
812 fsc->sb->s_root = root; 810 fsc->sb->s_root = root;
813 first = 1; 811 first = 1;
814 812
@@ -818,6 +816,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
818 } 816 }
819 817
820 if (path[0] == 0) { 818 if (path[0] == 0) {
819 root = fsc->sb->s_root;
821 dget(root); 820 dget(root);
822 } else { 821 } else {
823 dout("mount opening base mountpoint\n"); 822 dout("mount opening base mountpoint\n");
@@ -833,16 +832,14 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
833 mutex_unlock(&fsc->client->mount_mutex); 832 mutex_unlock(&fsc->client->mount_mutex);
834 return root; 833 return root;
835 834
836out:
837 mutex_unlock(&fsc->client->mount_mutex);
838 return ERR_PTR(err);
839
840fail: 835fail:
841 if (first) { 836 if (first) {
842 dput(fsc->sb->s_root); 837 dput(fsc->sb->s_root);
843 fsc->sb->s_root = NULL; 838 fsc->sb->s_root = NULL;
844 } 839 }
845 goto out; 840out:
841 mutex_unlock(&fsc->client->mount_mutex);
842 return ERR_PTR(err);
846} 843}
847 844
848static int ceph_set_super(struct super_block *s, void *data) 845static int ceph_set_super(struct super_block *s, void *data)
@@ -1042,19 +1039,14 @@ static int __init init_ceph(void)
1042 1039
1043 ceph_flock_init(); 1040 ceph_flock_init();
1044 ceph_xattr_init(); 1041 ceph_xattr_init();
1045 ret = ceph_snap_init();
1046 if (ret)
1047 goto out_xattr;
1048 ret = register_filesystem(&ceph_fs_type); 1042 ret = register_filesystem(&ceph_fs_type);
1049 if (ret) 1043 if (ret)
1050 goto out_snap; 1044 goto out_xattr;
1051 1045
1052 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1046 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1053 1047
1054 return 0; 1048 return 0;
1055 1049
1056out_snap:
1057 ceph_snap_exit();
1058out_xattr: 1050out_xattr:
1059 ceph_xattr_exit(); 1051 ceph_xattr_exit();
1060 destroy_caches(); 1052 destroy_caches();
@@ -1066,7 +1058,6 @@ static void __exit exit_ceph(void)
1066{ 1058{
1067 dout("exit_ceph\n"); 1059 dout("exit_ceph\n");
1068 unregister_filesystem(&ceph_fs_type); 1060 unregister_filesystem(&ceph_fs_type);
1069 ceph_snap_exit();
1070 ceph_xattr_exit(); 1061 ceph_xattr_exit();
1071 destroy_caches(); 1062 destroy_caches();
1072} 1063}
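The mdsmap plumbing moves to the generic monitor subscription API: interest is declared once at client creation, and each map is acknowledged by epoch only after it has been installed under mdsc->mutex (the removed ceph_monc_got_mdsmap() call acknowledged before the map was even decoded). The shape of the handshake, using just the two calls visible in the hunks:

        /* at client setup: subscribe to the mdsmap, from epoch 0, continuously */
        ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);

        /* in the map handler, once the new map is committed */
        ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
                          mdsc->mdsmap->m_epoch);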
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 9c458eb52245..e705c4d612d7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -37,8 +37,7 @@
37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
38#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ 38#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
39 39
40#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ 40#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
41 CEPH_MOUNT_OPT_DCACHE)
42 41
43#define ceph_set_mount_opt(fsc, opt) \ 42#define ceph_set_mount_opt(fsc, opt) \
44 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; 43 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -469,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
469#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 468#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
470#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 469#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
471#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 470#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
472 471#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
473 472
474static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 473static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
475 long long release_count, 474 long long release_count,
@@ -721,7 +720,6 @@ static inline int default_congestion_kb(void)
721 720
722 721
723/* snap.c */ 722/* snap.c */
724extern struct ceph_snap_context *ceph_empty_snapc;
725struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 723struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
726 u64 ino); 724 u64 ino);
727extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, 725extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -738,8 +736,6 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
738extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 736extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
739 struct ceph_cap_snap *capsnap); 737 struct ceph_cap_snap *capsnap);
740extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 738extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
741extern int ceph_snap_init(void);
742extern void ceph_snap_exit(void);
743 739
744/* 740/*
745 * a cap_snap is "pending" if it is still awaiting an in-progress 741 * a cap_snap is "pending" if it is still awaiting an in-progress
@@ -808,6 +804,20 @@ extern void __init ceph_xattr_init(void);
808extern void ceph_xattr_exit(void); 804extern void ceph_xattr_exit(void);
809extern const struct xattr_handler *ceph_xattr_handlers[]; 805extern const struct xattr_handler *ceph_xattr_handlers[];
810 806
807#ifdef CONFIG_SECURITY
808extern bool ceph_security_xattr_deadlock(struct inode *in);
809extern bool ceph_security_xattr_wanted(struct inode *in);
810#else
811static inline bool ceph_security_xattr_deadlock(struct inode *in)
812{
813 return false;
814}
815static inline bool ceph_security_xattr_wanted(struct inode *in)
816{
817 return false;
818}
819#endif
820
811/* acl.c */ 821/* acl.c */
812struct ceph_acls_info { 822struct ceph_acls_info {
813 void *default_acl; 823 void *default_acl;
@@ -947,7 +957,6 @@ extern void ceph_dentry_lru_touch(struct dentry *dn);
947extern void ceph_dentry_lru_del(struct dentry *dn); 957extern void ceph_dentry_lru_del(struct dentry *dn);
948extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 958extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
949extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); 959extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
950extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
951extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); 960extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
952 961
953/* 962/*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 819163d8313b..9410abdef3ce 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -714,31 +714,62 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
714 } 714 }
715} 715}
716 716
717static inline int __get_request_mask(struct inode *in) {
718 struct ceph_mds_request *req = current->journal_info;
719 int mask = 0;
720 if (req && req->r_target_inode == in) {
721 if (req->r_op == CEPH_MDS_OP_LOOKUP ||
722 req->r_op == CEPH_MDS_OP_LOOKUPINO ||
723 req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
724 req->r_op == CEPH_MDS_OP_GETATTR) {
725 mask = le32_to_cpu(req->r_args.getattr.mask);
726 } else if (req->r_op == CEPH_MDS_OP_OPEN ||
727 req->r_op == CEPH_MDS_OP_CREATE) {
728 mask = le32_to_cpu(req->r_args.open.mask);
729 }
730 }
731 return mask;
732}
733
717ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, 734ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
718 size_t size) 735 size_t size)
719{ 736{
720 struct ceph_inode_info *ci = ceph_inode(inode); 737 struct ceph_inode_info *ci = ceph_inode(inode);
721 int err;
722 struct ceph_inode_xattr *xattr; 738 struct ceph_inode_xattr *xattr;
723 struct ceph_vxattr *vxattr = NULL; 739 struct ceph_vxattr *vxattr = NULL;
740 int req_mask;
741 int err;
724 742
725 if (!ceph_is_valid_xattr(name)) 743 if (!ceph_is_valid_xattr(name))
726 return -ENODATA; 744 return -ENODATA;
727 745
728 /* let's see if a virtual xattr was requested */ 746 /* let's see if a virtual xattr was requested */
729 vxattr = ceph_match_vxattr(inode, name); 747 vxattr = ceph_match_vxattr(inode, name);
730 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { 748 if (vxattr) {
731 err = vxattr->getxattr_cb(ci, value, size); 749 err = -ENODATA;
750 if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
751 err = vxattr->getxattr_cb(ci, value, size);
732 return err; 752 return err;
733 } 753 }
734 754
755 req_mask = __get_request_mask(inode);
756
735 spin_lock(&ci->i_ceph_lock); 757 spin_lock(&ci->i_ceph_lock);
736 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 758 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
737 ci->i_xattrs.version, ci->i_xattrs.index_version); 759 ci->i_xattrs.version, ci->i_xattrs.index_version);
738 760
739 if (ci->i_xattrs.version == 0 || 761 if (ci->i_xattrs.version == 0 ||
740 !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { 762 !((req_mask & CEPH_CAP_XATTR_SHARED) ||
763 __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
741 spin_unlock(&ci->i_ceph_lock); 764 spin_unlock(&ci->i_ceph_lock);
765
766 /* security module gets xattr while filling trace */
767 if (current->journal_info != NULL) {
768 pr_warn_ratelimited("sync getxattr %p "
769 "during filling trace\n", inode);
770 return -EBUSY;
771 }
772
742 /* get xattrs from mds (if we don't already have them) */ 773 /* get xattrs from mds (if we don't already have them) */
743 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); 774 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
744 if (err) 775 if (err)
@@ -765,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
765 796
766 memcpy(value, xattr->val, xattr->val_len); 797 memcpy(value, xattr->val, xattr->val_len);
767 798
799 if (current->journal_info != NULL &&
800 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
801 ci->i_ceph_flags |= CEPH_I_SEC_INITED;
768out: 802out:
769 spin_unlock(&ci->i_ceph_lock); 803 spin_unlock(&ci->i_ceph_lock);
770 return err; 804 return err;
@@ -999,7 +1033,7 @@ retry:
999 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1033 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
1000 &prealloc_cf); 1034 &prealloc_cf);
1001 ci->i_xattrs.dirty = true; 1035 ci->i_xattrs.dirty = true;
1002 inode->i_ctime = CURRENT_TIME; 1036 inode->i_ctime = current_fs_time(inode->i_sb);
1003 } 1037 }
1004 1038
1005 spin_unlock(&ci->i_ceph_lock); 1039 spin_unlock(&ci->i_ceph_lock);
@@ -1015,7 +1049,15 @@ do_sync:
1015do_sync_unlocked: 1049do_sync_unlocked:
1016 if (lock_snap_rwsem) 1050 if (lock_snap_rwsem)
1017 up_read(&mdsc->snap_rwsem); 1051 up_read(&mdsc->snap_rwsem);
1018 err = ceph_sync_setxattr(dentry, name, value, size, flags); 1052
1053 /* security module set xattr while filling trace */
1054 if (current->journal_info != NULL) {
1055 pr_warn_ratelimited("sync setxattr %p "
1056 "during filling trace\n", inode);
1057 err = -EBUSY;
1058 } else {
1059 err = ceph_sync_setxattr(dentry, name, value, size, flags);
1060 }
1019out: 1061out:
1020 ceph_free_cap_flush(prealloc_cf); 1062 ceph_free_cap_flush(prealloc_cf);
1021 kfree(newname); 1063 kfree(newname);
@@ -1136,7 +1178,7 @@ retry:
1136 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1178 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
1137 &prealloc_cf); 1179 &prealloc_cf);
1138 ci->i_xattrs.dirty = true; 1180 ci->i_xattrs.dirty = true;
1139 inode->i_ctime = CURRENT_TIME; 1181 inode->i_ctime = current_fs_time(inode->i_sb);
1140 spin_unlock(&ci->i_ceph_lock); 1182 spin_unlock(&ci->i_ceph_lock);
1141 if (lock_snap_rwsem) 1183 if (lock_snap_rwsem)
1142 up_read(&mdsc->snap_rwsem); 1184 up_read(&mdsc->snap_rwsem);
@@ -1164,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
1164 1206
1165 return __ceph_removexattr(dentry, name); 1207 return __ceph_removexattr(dentry, name);
1166} 1208}
1209
1210#ifdef CONFIG_SECURITY
1211bool ceph_security_xattr_wanted(struct inode *in)
1212{
1213 return in->i_security != NULL;
1214}
1215
1216bool ceph_security_xattr_deadlock(struct inode *in)
1217{
1218 struct ceph_inode_info *ci;
1219 bool ret;
1220 if (in->i_security == NULL)
1221 return false;
1222 ci = ceph_inode(in);
1223 spin_lock(&ci->i_ceph_lock);
1224 ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
1225 !(ci->i_xattrs.version > 0 &&
1226 __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
1227 spin_unlock(&ci->i_ceph_lock);
1228 return ret;
1229}
1230#endif
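Taken together with the mds_client.c marker, the getxattr/setxattr guards break a recursion: while ceph_fill_trace() instantiates a dentry, the security module may fetch an xattr on the very inode being filled, and a synchronous MDS getattr from that context could deadlock on the reply path. Roughly, assuming an LSM that initializes inode security from ->getxattr (the xattr name is only an example):

        /*
         * handle_reply()
         *   current->journal_info = req;
         *   ceph_fill_trace()
         *     splice_dentry() -> d_splice_alias()
         *       security_d_instantiate()
         *         __ceph_getxattr(inode, "security.selinux", ...)
         *           - served locally when the originating request asked for
         *             CEPH_CAP_XATTR_SHARED (see __get_request_mask()),
         *           - otherwise fails fast with -EBUSY rather than issuing
         *             a nested, deadlock-prone ceph_do_getattr().
         */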
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 50b268483302..788e19195991 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = {
255static ssize_t cifs_stats_proc_write(struct file *file, 255static ssize_t cifs_stats_proc_write(struct file *file,
256 const char __user *buffer, size_t count, loff_t *ppos) 256 const char __user *buffer, size_t count, loff_t *ppos)
257{ 257{
258 char c;
259 bool bv; 258 bool bv;
260 int rc; 259 int rc;
261 struct list_head *tmp1, *tmp2, *tmp3; 260 struct list_head *tmp1, *tmp2, *tmp3;
@@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
263 struct cifs_ses *ses; 262 struct cifs_ses *ses;
264 struct cifs_tcon *tcon; 263 struct cifs_tcon *tcon;
265 264
266 rc = get_user(c, buffer); 265 rc = kstrtobool_from_user(buffer, count, &bv);
267 if (rc) 266 if (rc == 0) {
268 return rc;
269
270 if (strtobool(&c, &bv) == 0) {
271#ifdef CONFIG_CIFS_STATS2 267#ifdef CONFIG_CIFS_STATS2
272 atomic_set(&totBufAllocCount, 0); 268 atomic_set(&totBufAllocCount, 0);
273 atomic_set(&totSmBufAllocCount, 0); 269 atomic_set(&totSmBufAllocCount, 0);
@@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
290 } 286 }
291 } 287 }
292 spin_unlock(&cifs_tcp_ses_lock); 288 spin_unlock(&cifs_tcp_ses_lock);
289 } else {
290 return rc;
293 } 291 }
294 292
295 return count; 293 return count;
@@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file)
433static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, 431static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
434 size_t count, loff_t *ppos) 432 size_t count, loff_t *ppos)
435{ 433{
436 char c; 434 char c[2] = { '\0' };
437 bool bv; 435 bool bv;
438 int rc; 436 int rc;
439 437
440 rc = get_user(c, buffer); 438 rc = get_user(c[0], buffer);
441 if (rc) 439 if (rc)
442 return rc; 440 return rc;
443 if (strtobool(&c, &bv) == 0) 441 if (strtobool(c, &bv) == 0)
444 cifsFYI = bv; 442 cifsFYI = bv;
445 else if ((c > '1') && (c <= '9')) 443 else if ((c[0] > '1') && (c[0] <= '9'))
446 cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ 444 cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
447 445
448 return count; 446 return count;
449} 447}
@@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
471static ssize_t cifs_linux_ext_proc_write(struct file *file, 469static ssize_t cifs_linux_ext_proc_write(struct file *file,
472 const char __user *buffer, size_t count, loff_t *ppos) 470 const char __user *buffer, size_t count, loff_t *ppos)
473{ 471{
474 char c;
475 bool bv;
476 int rc; 472 int rc;
477 473
478 rc = get_user(c, buffer); 474 rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled);
479 if (rc) 475 if (rc)
480 return rc; 476 return rc;
481 477
482 rc = strtobool(&c, &bv);
483 if (rc)
484 return rc;
485
486 linuxExtEnabled = bv;
487
488 return count; 478 return count;
489} 479}
490 480
@@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
511static ssize_t cifs_lookup_cache_proc_write(struct file *file, 501static ssize_t cifs_lookup_cache_proc_write(struct file *file,
512 const char __user *buffer, size_t count, loff_t *ppos) 502 const char __user *buffer, size_t count, loff_t *ppos)
513{ 503{
514 char c;
515 bool bv;
516 int rc; 504 int rc;
517 505
518 rc = get_user(c, buffer); 506 rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled);
519 if (rc) 507 if (rc)
520 return rc; 508 return rc;
521 509
522 rc = strtobool(&c, &bv);
523 if (rc)
524 return rc;
525
526 lookupCacheEnabled = bv;
527
528 return count; 510 return count;
529} 511}
530 512
@@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file)
551static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, 533static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
552 size_t count, loff_t *ppos) 534 size_t count, loff_t *ppos)
553{ 535{
554 char c;
555 bool bv;
556 int rc; 536 int rc;
557 537
558 rc = get_user(c, buffer); 538 rc = kstrtobool_from_user(buffer, count, &traceSMB);
559 if (rc) 539 if (rc)
560 return rc; 540 return rc;
561 541
562 rc = strtobool(&c, &bv);
563 if (rc)
564 return rc;
565
566 traceSMB = bv;
567
568 return count; 542 return count;
569} 543}
570 544
@@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
622 int rc; 596 int rc;
623 unsigned int flags; 597 unsigned int flags;
624 char flags_string[12]; 598 char flags_string[12];
625 char c;
626 bool bv; 599 bool bv;
627 600
628 if ((count < 1) || (count > 11)) 601 if ((count < 1) || (count > 11))
@@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
635 608
636 if (count < 3) { 609 if (count < 3) {
637 /* single char or single char followed by null */ 610 /* single char or single char followed by null */
638 c = flags_string[0]; 611 if (strtobool(flags_string, &bv) == 0) {
639 if (strtobool(&c, &bv) == 0) {
640 global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; 612 global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
641 return count; 613 return count;
642 } else if (!isdigit(c)) { 614 } else if (!isdigit(flags_string[0])) {
643 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", 615 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
644 flags_string); 616 flags_string);
645 return -EINVAL; 617 return -EINVAL;
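The cifs /proc writers converge on kstrtobool_from_user(), which copies from the user buffer and parses 1/0, y/n and on/off in a single call; only cifsFYI keeps an open-coded path because it additionally accepts the digits 2-9 as verbosity levels. A minimal sketch of the resulting pattern, with my_flag as a hypothetical module-level bool:

        static ssize_t my_flag_proc_write(struct file *file,
                                          const char __user *buffer,
                                          size_t count, loff_t *ppos)
        {
                /* replaces the old get_user() + strtobool() pair */
                int rc = kstrtobool_from_user(buffer, count, &my_flag);

                if (rc)
                        return rc;
                return count;
        }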
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 66cf0f9fff89..c611ca2339d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,7 +25,7 @@
25void cifs_dump_mem(char *label, void *data, int length); 25void cifs_dump_mem(char *label, void *data, int length);
26void cifs_dump_detail(void *); 26void cifs_dump_detail(void *);
27void cifs_dump_mids(struct TCP_Server_Info *); 27void cifs_dump_mids(struct TCP_Server_Info *);
28extern int traceSMB; /* flag which enables the function below */ 28extern bool traceSMB; /* flag which enables the function below */
29void dump_smb(void *, int); 29void dump_smb(void *, int);
30#define CIFS_INFO 0x01 30#define CIFS_INFO 0x01
31#define CIFS_RC 0x02 31#define CIFS_RC 0x02
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e682b36a210f..4897dacf8944 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -33,6 +33,7 @@
33#include <linux/ctype.h> 33#include <linux/ctype.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/highmem.h> 35#include <linux/highmem.h>
36#include <crypto/skcipher.h>
36 37
37static int 38static int
38cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) 39cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
@@ -789,38 +790,46 @@ int
789calc_seckey(struct cifs_ses *ses) 790calc_seckey(struct cifs_ses *ses)
790{ 791{
791 int rc; 792 int rc;
792 struct crypto_blkcipher *tfm_arc4; 793 struct crypto_skcipher *tfm_arc4;
793 struct scatterlist sgin, sgout; 794 struct scatterlist sgin, sgout;
794 struct blkcipher_desc desc; 795 struct skcipher_request *req;
795 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ 796 unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
796 797
797 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 798 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
798 799
799 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 800 tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
800 if (IS_ERR(tfm_arc4)) { 801 if (IS_ERR(tfm_arc4)) {
801 rc = PTR_ERR(tfm_arc4); 802 rc = PTR_ERR(tfm_arc4);
802 cifs_dbg(VFS, "could not allocate crypto API arc4\n"); 803 cifs_dbg(VFS, "could not allocate crypto API arc4\n");
803 return rc; 804 return rc;
804 } 805 }
805 806
806 desc.tfm = tfm_arc4; 807 rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response,
807
808 rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
809 CIFS_SESS_KEY_SIZE); 808 CIFS_SESS_KEY_SIZE);
810 if (rc) { 809 if (rc) {
811 cifs_dbg(VFS, "%s: Could not set response as a key\n", 810 cifs_dbg(VFS, "%s: Could not set response as a key\n",
812 __func__); 811 __func__);
813 return rc; 812 goto out_free_cipher;
813 }
814
815 req = skcipher_request_alloc(tfm_arc4, GFP_KERNEL);
816 if (!req) {
817 rc = -ENOMEM;
818 cifs_dbg(VFS, "could not allocate crypto API arc4 request\n");
819 goto out_free_cipher;
814 } 820 }
815 821
816 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); 822 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
817 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); 823 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
818 824
819 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); 825 skcipher_request_set_callback(req, 0, NULL, NULL);
826 skcipher_request_set_crypt(req, &sgin, &sgout, CIFS_CPHTXT_SIZE, NULL);
827
828 rc = crypto_skcipher_encrypt(req);
829 skcipher_request_free(req);
820 if (rc) { 830 if (rc) {
821 cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc); 831 cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
822 crypto_free_blkcipher(tfm_arc4); 832 goto out_free_cipher;
823 return rc;
824 } 833 }
825 834
826 /* make secondary_key/nonce as session key */ 835 /* make secondary_key/nonce as session key */
@@ -828,7 +837,8 @@ calc_seckey(struct cifs_ses *ses)
828 /* and make len as that of session key only */ 837 /* and make len as that of session key only */
829 ses->auth_key.len = CIFS_SESS_KEY_SIZE; 838 ses->auth_key.len = CIFS_SESS_KEY_SIZE;
830 839
831 crypto_free_blkcipher(tfm_arc4); 840out_free_cipher:
841 crypto_free_skcipher(tfm_arc4);
832 842
833 return rc; 843 return rc;
834} 844}
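calc_seckey() is converted from the deprecated blkcipher interface to the skcipher API; passing CRYPTO_ALG_ASYNC in the mask requests a synchronous implementation, so no completion callback is needed. A condensed, self-contained sketch of the same sequence follows (the helper name is hypothetical and error handling is trimmed to the essentials); the smbencrypt.c hunk below applies the identical transformation to its DES helper:

        #include <crypto/skcipher.h>
        #include <linux/scatterlist.h>

        static int ecb_arc4_encrypt_once(const u8 *key, unsigned int keylen,
                                         struct scatterlist *src,
                                         struct scatterlist *dst,
                                         unsigned int len)
        {
                struct crypto_skcipher *tfm;
                struct skcipher_request *req;
                int rc;

                tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
                if (IS_ERR(tfm))
                        return PTR_ERR(tfm);

                rc = crypto_skcipher_setkey(tfm, key, keylen);
                if (rc)
                        goto out_free_tfm;

                req = skcipher_request_alloc(tfm, GFP_KERNEL);
                if (!req) {
                        rc = -ENOMEM;
                        goto out_free_tfm;
                }

                /* synchronous tfm: no completion callback needed */
                skcipher_request_set_callback(req, 0, NULL, NULL);
                /* ECB takes no IV, hence the NULL */
                skcipher_request_set_crypt(req, src, dst, len, NULL);

                rc = crypto_skcipher_encrypt(req);
                skcipher_request_free(req);
        out_free_tfm:
                crypto_free_skcipher(tfm);
                return rc;
        }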
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2eea40353e60..1d86fc620e5c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,10 +54,10 @@
54#endif 54#endif
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
57int traceSMB = 0; 57bool traceSMB;
58bool enable_oplocks = true; 58bool enable_oplocks = true;
59unsigned int linuxExtEnabled = 1; 59bool linuxExtEnabled = true;
60unsigned int lookupCacheEnabled = 1; 60bool lookupCacheEnabled = true;
61unsigned int global_secflags = CIFSSEC_DEF; 61unsigned int global_secflags = CIFSSEC_DEF;
62/* unsigned int ntlmv2_support = 0; */ 62/* unsigned int ntlmv2_support = 0; */
63unsigned int sign_CIFS_PDUs = 1; 63unsigned int sign_CIFS_PDUs = 1;
@@ -642,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
642 while (*s && *s != sep) 642 while (*s && *s != sep)
643 s++; 643 s++;
644 644
645 inode_lock(dir); 645 child = lookup_one_len_unlocked(p, dentry, s - p);
646 child = lookup_one_len(p, dentry, s - p);
647 inode_unlock(dir);
648 dput(dentry); 646 dput(dentry);
649 dentry = child; 647 dentry = child;
650 } while (!IS_ERR(dentry)); 648 } while (!IS_ERR(dentry));
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a25b2513f146..d21da9f05bae 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1596,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount;
1596 1596
1597/* Misc globals */ 1597/* Misc globals */
1598GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ 1598GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
1599GLOBAL_EXTERN unsigned int lookupCacheEnabled; 1599GLOBAL_EXTERN bool lookupCacheEnabled;
1600GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent 1600GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
1601 with more secure ntlmssp2 challenge/resp */ 1601 with more secure ntlmssp2 challenge/resp */
1602GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ 1602GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
1603GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 1603GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
1604GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ 1604GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
1605GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ 1605GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
1606GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 1606GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a4232ec4f2ba..699b7868108f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -23,6 +23,7 @@
23 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 23 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24*/ 24*/
25 25
26#include <crypto/skcipher.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <linux/fs.h> 29#include <linux/fs.h>
@@ -70,31 +71,42 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
70{ 71{
71 int rc; 72 int rc;
72 unsigned char key2[8]; 73 unsigned char key2[8];
73 struct crypto_blkcipher *tfm_des; 74 struct crypto_skcipher *tfm_des;
74 struct scatterlist sgin, sgout; 75 struct scatterlist sgin, sgout;
75 struct blkcipher_desc desc; 76 struct skcipher_request *req;
76 77
77 str_to_key(key, key2); 78 str_to_key(key, key2);
78 79
79 tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); 80 tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
80 if (IS_ERR(tfm_des)) { 81 if (IS_ERR(tfm_des)) {
81 rc = PTR_ERR(tfm_des); 82 rc = PTR_ERR(tfm_des);
82 cifs_dbg(VFS, "could not allocate des crypto API\n"); 83 cifs_dbg(VFS, "could not allocate des crypto API\n");
83 goto smbhash_err; 84 goto smbhash_err;
84 } 85 }
85 86
86 desc.tfm = tfm_des; 87 req = skcipher_request_alloc(tfm_des, GFP_KERNEL);
88 if (!req) {
89 rc = -ENOMEM;
90 cifs_dbg(VFS, "could not allocate des crypto API\n");
91 goto smbhash_free_skcipher;
92 }
87 93
88 crypto_blkcipher_setkey(tfm_des, key2, 8); 94 crypto_skcipher_setkey(tfm_des, key2, 8);
89 95
90 sg_init_one(&sgin, in, 8); 96 sg_init_one(&sgin, in, 8);
91 sg_init_one(&sgout, out, 8); 97 sg_init_one(&sgout, out, 8);
92 98
93 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); 99 skcipher_request_set_callback(req, 0, NULL, NULL);
100 skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL);
101
102 rc = crypto_skcipher_encrypt(req);
94 if (rc) 103 if (rc)
95 cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); 104 cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
96 105
97 crypto_free_blkcipher(tfm_des); 106 skcipher_request_free(req);
107
108smbhash_free_skcipher:
109 crypto_free_skcipher(tfm_des);
98smbhash_err: 110smbhash_err:
99 return rc; 111 return rc;
100} 112}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6402eaf8ab95..bd01b92aad98 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1040,28 +1040,6 @@ COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
1040/* PPPOX */ 1040/* PPPOX */
1041COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1041COMPATIBLE_IOCTL(PPPOEIOCSFWD)
1042COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1042COMPATIBLE_IOCTL(PPPOEIOCDFWD)
1043/* ppdev */
1044COMPATIBLE_IOCTL(PPSETMODE)
1045COMPATIBLE_IOCTL(PPRSTATUS)
1046COMPATIBLE_IOCTL(PPRCONTROL)
1047COMPATIBLE_IOCTL(PPWCONTROL)
1048COMPATIBLE_IOCTL(PPFCONTROL)
1049COMPATIBLE_IOCTL(PPRDATA)
1050COMPATIBLE_IOCTL(PPWDATA)
1051COMPATIBLE_IOCTL(PPCLAIM)
1052COMPATIBLE_IOCTL(PPRELEASE)
1053COMPATIBLE_IOCTL(PPYIELD)
1054COMPATIBLE_IOCTL(PPEXCL)
1055COMPATIBLE_IOCTL(PPDATADIR)
1056COMPATIBLE_IOCTL(PPNEGOT)
1057COMPATIBLE_IOCTL(PPWCTLONIRQ)
1058COMPATIBLE_IOCTL(PPCLRIRQ)
1059COMPATIBLE_IOCTL(PPSETPHASE)
1060COMPATIBLE_IOCTL(PPGETMODES)
1061COMPATIBLE_IOCTL(PPGETMODE)
1062COMPATIBLE_IOCTL(PPGETPHASE)
1063COMPATIBLE_IOCTL(PPGETFLAGS)
1064COMPATIBLE_IOCTL(PPSETFLAGS)
1065/* Big A */ 1043/* Big A */
1066/* sparc only */ 1044/* sparc only */
1067/* Big Q for sound/OSS */ 1045/* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index f419519ec41f..ea59c891fc53 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -432,14 +432,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
432 (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ? 432 (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
433 configfs_init_bin_file : 433 configfs_init_bin_file :
434 configfs_init_file); 434 configfs_init_file);
435 if (error) { 435 if (error)
436 configfs_put(sd); 436 configfs_put(sd);
437 return error; 437 return error;
438 }
439
440 d_rehash(dentry);
441
442 return 0;
443} 438}
444 439
445static struct dentry * configfs_lookup(struct inode *dir, 440static struct dentry * configfs_lookup(struct inode *dir,
@@ -701,23 +696,29 @@ static int populate_groups(struct config_group *group)
701{ 696{
702 struct config_group *new_group; 697 struct config_group *new_group;
703 int ret = 0; 698 int ret = 0;
704 int i;
705
706 if (group->default_groups) {
707 for (i = 0; group->default_groups[i]; i++) {
708 new_group = group->default_groups[i];
709 699
710 ret = create_default_group(group, new_group); 700 list_for_each_entry(new_group, &group->default_groups, group_entry) {
711 if (ret) { 701 ret = create_default_group(group, new_group);
712 detach_groups(group); 702 if (ret) {
713 break; 703 detach_groups(group);
714 } 704 break;
715 } 705 }
716 } 706 }
717 707
718 return ret; 708 return ret;
719} 709}
720 710
711void configfs_remove_default_groups(struct config_group *group)
712{
713 struct config_group *g, *n;
714
715 list_for_each_entry_safe(g, n, &group->default_groups, group_entry) {
716 list_del(&g->group_entry);
717 config_item_put(&g->cg_item);
718 }
719}
720EXPORT_SYMBOL(configfs_remove_default_groups);
721
721/* 722/*
722 * All of link_obj/unlink_obj/link_group/unlink_group require that 723 * All of link_obj/unlink_obj/link_group/unlink_group require that
723 * subsys->su_mutex is held. 724 * subsys->su_mutex is held.
@@ -766,15 +767,10 @@ static void link_obj(struct config_item *parent_item, struct config_item *item)
766 767
767static void unlink_group(struct config_group *group) 768static void unlink_group(struct config_group *group)
768{ 769{
769 int i;
770 struct config_group *new_group; 770 struct config_group *new_group;
771 771
772 if (group->default_groups) { 772 list_for_each_entry(new_group, &group->default_groups, group_entry)
773 for (i = 0; group->default_groups[i]; i++) { 773 unlink_group(new_group);
774 new_group = group->default_groups[i];
775 unlink_group(new_group);
776 }
777 }
778 774
779 group->cg_subsys = NULL; 775 group->cg_subsys = NULL;
780 unlink_obj(&group->cg_item); 776 unlink_obj(&group->cg_item);
@@ -782,7 +778,6 @@ static void unlink_group(struct config_group *group)
782 778
783static void link_group(struct config_group *parent_group, struct config_group *group) 779static void link_group(struct config_group *parent_group, struct config_group *group)
784{ 780{
785 int i;
786 struct config_group *new_group; 781 struct config_group *new_group;
787 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ 782 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
788 783
@@ -796,12 +791,8 @@ static void link_group(struct config_group *parent_group, struct config_group *g
796 BUG(); 791 BUG();
797 group->cg_subsys = subsys; 792 group->cg_subsys = subsys;
798 793
799 if (group->default_groups) { 794 list_for_each_entry(new_group, &group->default_groups, group_entry)
800 for (i = 0; group->default_groups[i]; i++) { 795 link_group(group, new_group);
801 new_group = group->default_groups[i];
802 link_group(group, new_group);
803 }
804 }
805} 796}
806 797
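The hunks above replace configfs's NULL-terminated default_groups arrays with a kernel linked list, so default groups can be attached and detached individually (see configfs_remove_default_groups() above). A minimal userspace sketch of the same intrusive-list iteration pattern; list_head, container_of and list_for_each_entry are reimplemented here as stand-ins for <linux/list.h>, and it builds with gcc (typeof is a GCC extension):

#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel's <linux/list.h> primitives. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each_entry(pos, head, member)                          \
	for (pos = container_of((head)->next, typeof(*pos), member);    \
	     &pos->member != (head);                                    \
	     pos = container_of(pos->member.next, typeof(*pos), member))

struct config_group {
	const char *name;
	struct list_head group_entry;	/* links into the parent's list */
};

int main(void)
{
	struct list_head default_groups = LIST_HEAD_INIT(default_groups);
	struct config_group a = { "a" }, b = { "b" };
	struct config_group *g;

	list_add_tail(&a.group_entry, &default_groups);
	list_add_tail(&b.group_entry, &default_groups);

	/* The iteration style the patch switches to: no index variable,
	 * no NULL sentinel, and entries can be unlinked one at a time. */
	list_for_each_entry(g, &default_groups, group_entry)
		printf("%s\n", g->name);
	return 0;
}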
807/* 798/*
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cee087d8f7e0..03d124ae27d7 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -75,7 +75,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
75 sd_iattr->ia_mode = sd->s_mode; 75 sd_iattr->ia_mode = sd->s_mode;
76 sd_iattr->ia_uid = GLOBAL_ROOT_UID; 76 sd_iattr->ia_uid = GLOBAL_ROOT_UID;
77 sd_iattr->ia_gid = GLOBAL_ROOT_GID; 77 sd_iattr->ia_gid = GLOBAL_ROOT_GID;
78 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 78 sd_iattr->ia_atime = sd_iattr->ia_mtime =
79 sd_iattr->ia_ctime = current_fs_time(inode->i_sb);
79 sd->s_iattr = sd_iattr; 80 sd->s_iattr = sd_iattr;
80 } 81 }
81 /* attributes were changed at least once in the past */ 82 /* attributes were changed at least once in the past */
@@ -111,7 +112,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
111static inline void set_default_inode_attr(struct inode * inode, umode_t mode) 112static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
112{ 113{
113 inode->i_mode = mode; 114 inode->i_mode = mode;
114 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 115 inode->i_atime = inode->i_mtime =
116 inode->i_ctime = current_fs_time(inode->i_sb);
115} 117}
116 118
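CURRENT_TIME is being replaced with current_fs_time(sb), which truncates the current time to the superblock's timestamp granularity rather than always stamping full-resolution wall time. A hypothetical userspace analogue of that truncation (the granularity values are illustrative; s_time_gran is the kernel field being mimicked):

#include <stdio.h>
#include <time.h>

/* Clamp "now" to a filesystem's timestamp granularity, e.g. 1 ns for
 * ext4 or 1 s for a filesystem with second-resolution on-disk stamps. */
static struct timespec fs_time(long gran_ns)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	if (gran_ns == 1000000000L)
		ts.tv_nsec = 0;				/* 1 s granularity */
	else if (gran_ns > 1)
		ts.tv_nsec -= ts.tv_nsec % gran_ns;	/* coarser than 1 ns */
	return ts;
}

int main(void)
{
	struct timespec coarse = fs_time(1000000000L);

	printf("%lld.%09ld\n", (long long)coarse.tv_sec, coarse.tv_nsec);
	return 0;
}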
117static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) 119static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -195,13 +197,21 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
195 return -ENOMEM; 197 return -ENOMEM;
196 198
197 p_inode = d_inode(dentry->d_parent); 199 p_inode = d_inode(dentry->d_parent);
198 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; 200 p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb);
199 configfs_set_inode_lock_class(sd, inode); 201 configfs_set_inode_lock_class(sd, inode);
200 202
201 init(inode); 203 init(inode);
202 d_instantiate(dentry, inode); 204 if (S_ISDIR(mode) || S_ISLNK(mode)) {
203 if (S_ISDIR(mode) || S_ISLNK(mode)) 205 /*
206 * ->symlink(), ->mkdir(), configfs_register_subsystem() or
207 * create_default_group() - already hashed.
208 */
209 d_instantiate(dentry, inode);
204 dget(dentry); /* pin link and directory dentries in core */ 210 dget(dentry); /* pin link and directory dentries in core */
211 } else {
212 /* ->lookup() */
213 d_add(dentry, inode);
214 }
205 return error; 215 return error;
206} 216}
207 217
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index b863a09cd2f1..8b2a994042dd 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -182,6 +182,7 @@ void config_group_init(struct config_group *group)
182{ 182{
183 config_item_init(&group->cg_item); 183 config_item_init(&group->cg_item);
184 INIT_LIST_HEAD(&group->cg_children); 184 INIT_LIST_HEAD(&group->cg_children);
185 INIT_LIST_HEAD(&group->default_groups);
185} 186}
186EXPORT_SYMBOL(config_group_init); 187EXPORT_SYMBOL(config_group_init);
187 188
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9fdccf..47c32c3bfa1d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,9 @@
32#include <linux/pipe_fs_i.h> 32#include <linux/pipe_fs_i.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/sched.h>
36#include <linux/fs.h>
37#include <linux/path.h>
35#include <linux/timekeeping.h> 38#include <linux/timekeeping.h>
36 39
37#include <asm/uaccess.h> 40#include <asm/uaccess.h>
@@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo)
649 } 652 }
650 } else { 653 } else {
651 struct inode *inode; 654 struct inode *inode;
655 int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
656 O_LARGEFILE | O_EXCL;
652 657
653 if (cprm.limit < binfmt->min_coredump) 658 if (cprm.limit < binfmt->min_coredump)
654 goto fail_unlock; 659 goto fail_unlock;
@@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo)
687 * what matters is that at least one of the two processes 692 * what matters is that at least one of the two processes
688 * writes its coredump successfully, not which one. 693 * writes its coredump successfully, not which one.
689 */ 694 */
690 cprm.file = filp_open(cn.corename, 695 if (need_suid_safe) {
691 O_CREAT | 2 | O_NOFOLLOW | 696 /*
692 O_LARGEFILE | O_EXCL, 697 * Using user namespaces, normal user tasks can change
693 0600); 698 * their current->fs->root to point to arbitrary
699 * directories. Since the intention of the "only dump
700 * with a fully qualified path" rule is to control where
701 * coredumps may be placed using root privileges,
702 * current->fs->root must not be used. Instead, use the
703 * root directory of init_task.
704 */
705 struct path root;
706
707 task_lock(&init_task);
708 get_fs_root(init_task.fs, &root);
709 task_unlock(&init_task);
710 cprm.file = file_open_root(root.dentry, root.mnt,
711 cn.corename, open_flags, 0600);
712 path_put(&root);
713 } else {
714 cprm.file = filp_open(cn.corename, open_flags, 0600);
715 }
694 if (IS_ERR(cprm.file)) 716 if (IS_ERR(cprm.file))
695 goto fail_unlock; 717 goto fail_unlock;
696 718
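The non-suid-safe branch keeps the ordinary filp_open(); the suid-safe branch resolves corename against init_task's root so a user-namespace chroot cannot redirect where the dump lands. A userspace sketch of the same idea using openat() against a trusted directory fd (the /var/crash path is only an example; the flag set is the one the patch hoists into open_flags):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Resolve "core.dump" relative to a fixed, trusted directory
	 * instead of the (possibly attacker-chosen) current root --
	 * the userspace analogue of file_open_root(). */
	int rootfd = open("/var/crash", O_DIRECTORY | O_PATH);
	int fd;

	if (rootfd < 0) {
		perror("open /var/crash");
		return 1;
	}
	fd = openat(rootfd, "core.dump",
		    O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL,
		    0600);
	if (fd < 0)
		perror("openat");
	else
		close(fd);
	close(rootfd);
	return 0;
}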
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
new file mode 100644
index 000000000000..92348faf9865
--- /dev/null
+++ b/fs/crypto/Kconfig
@@ -0,0 +1,18 @@
1config FS_ENCRYPTION
2 tristate "FS Encryption (Per-file encryption)"
3 depends on BLOCK
4 select CRYPTO
5 select CRYPTO_AES
6 select CRYPTO_CBC
7 select CRYPTO_ECB
8 select CRYPTO_XTS
9 select CRYPTO_CTS
10 select CRYPTO_CTR
11 select CRYPTO_SHA256
12 select KEYS
13 select ENCRYPTED_KEYS
14 help
15 Enable encryption of files and directories. This
16 feature is similar to ecryptfs, but it is more memory
17 efficient since it avoids caching the encrypted and
18 decrypted pages in the page cache.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
new file mode 100644
index 000000000000..f17684c48739
--- /dev/null
+++ b/fs/crypto/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
2
3fscrypto-y := crypto.o fname.o policy.o keyinfo.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
new file mode 100644
index 000000000000..06cd1a22240b
--- /dev/null
+++ b/fs/crypto/crypto.c
@@ -0,0 +1,555 @@
1/*
2 * This contains encryption functions for per-file encryption.
3 *
4 * Copyright (C) 2015, Google, Inc.
5 * Copyright (C) 2015, Motorola Mobility
6 *
7 * Written by Michael Halcrow, 2014.
8 *
9 * Filename encryption additions
10 * Uday Savagaonkar, 2014
11 * Encryption policy handling additions
12 * Ildar Muslukhov, 2014
13 * Add fscrypt_pullback_bio_page()
14 * Jaegeuk Kim, 2015.
15 *
16 * This has not yet undergone a rigorous security audit.
17 *
18 * The usage of AES-XTS should conform to recommendations in NIST
19 * Special Publication 800-38E and IEEE P1619/D16.
20 */
21
22#include <linux/pagemap.h>
23#include <linux/mempool.h>
24#include <linux/module.h>
25#include <linux/scatterlist.h>
26#include <linux/ratelimit.h>
27#include <linux/bio.h>
28#include <linux/dcache.h>
29#include <linux/fscrypto.h>
30#include <linux/ecryptfs.h>
31
32static unsigned int num_prealloc_crypto_pages = 32;
33static unsigned int num_prealloc_crypto_ctxs = 128;
34
35module_param(num_prealloc_crypto_pages, uint, 0444);
36MODULE_PARM_DESC(num_prealloc_crypto_pages,
37 "Number of crypto pages to preallocate");
38module_param(num_prealloc_crypto_ctxs, uint, 0444);
39MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
40 "Number of crypto contexts to preallocate");
41
42static mempool_t *fscrypt_bounce_page_pool = NULL;
43
44static LIST_HEAD(fscrypt_free_ctxs);
45static DEFINE_SPINLOCK(fscrypt_ctx_lock);
46
47static struct workqueue_struct *fscrypt_read_workqueue;
48static DEFINE_MUTEX(fscrypt_init_mutex);
49
50static struct kmem_cache *fscrypt_ctx_cachep;
51struct kmem_cache *fscrypt_info_cachep;
52
53/**
54 * fscrypt_release_ctx() - Releases an encryption context
55 * @ctx: The encryption context to release.
56 *
57 * If the encryption context was allocated from the pre-allocated pool, returns
58 * it to that pool. Else, frees it.
59 *
60 * If there's a bounce page in the context, this frees that.
61 */
62void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
63{
64 unsigned long flags;
65
66 if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
67 mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
68 ctx->w.bounce_page = NULL;
69 }
70 ctx->w.control_page = NULL;
71 if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
72 kmem_cache_free(fscrypt_ctx_cachep, ctx);
73 } else {
74 spin_lock_irqsave(&fscrypt_ctx_lock, flags);
75 list_add(&ctx->free_list, &fscrypt_free_ctxs);
76 spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
77 }
78}
79EXPORT_SYMBOL(fscrypt_release_ctx);
80
81/**
82 * fscrypt_get_ctx() - Gets an encryption context
83 * @inode: The inode for which we are doing the crypto
84 *
85 * Allocates and initializes an encryption context.
86 *
87 * Return: An allocated and initialized encryption context on success; error
88 * value or NULL otherwise.
89 */
90struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode)
91{
92 struct fscrypt_ctx *ctx = NULL;
93 struct fscrypt_info *ci = inode->i_crypt_info;
94 unsigned long flags;
95
96 if (ci == NULL)
97 return ERR_PTR(-ENOKEY);
98
99 /*
100 * We first try getting the ctx from a free list because in
101 * the common case the ctx will have an allocated and
102 * initialized crypto tfm, so it's probably a worthwhile
103 * optimization. For the bounce page, we first try getting it
104 * from the kernel allocator because that's just about as fast
105 * as getting it from a list and because a cache of free pages
106 * should generally be a "last resort" option for a filesystem
107 * to be able to do its job.
108 */
109 spin_lock_irqsave(&fscrypt_ctx_lock, flags);
110 ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
111 struct fscrypt_ctx, free_list);
112 if (ctx)
113 list_del(&ctx->free_list);
114 spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
115 if (!ctx) {
116 ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
117 if (!ctx)
118 return ERR_PTR(-ENOMEM);
119 ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
120 } else {
121 ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
122 }
123 ctx->flags &= ~FS_WRITE_PATH_FL;
124 return ctx;
125}
126EXPORT_SYMBOL(fscrypt_get_ctx);
127
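fscrypt_get_ctx()/fscrypt_release_ctx() implement a classic preallocated free list with allocator fallback; FS_CTX_REQUIRES_FREE_ENCRYPT_FL records which path produced the context so release knows whether to free or recycle. A compressed userspace sketch of that pattern, with a pthread mutex standing in for the irq-safe spinlock:

#include <pthread.h>
#include <stdlib.h>

struct ctx {
	struct ctx *next;
	int requires_free;	/* plays FS_CTX_REQUIRES_FREE_ENCRYPT_FL */
};

static struct ctx *free_list;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct ctx *get_ctx(void)
{
	struct ctx *c;

	pthread_mutex_lock(&list_lock);
	c = free_list;			/* try the preallocated pool first */
	if (c)
		free_list = c->next;
	pthread_mutex_unlock(&list_lock);

	if (!c) {
		c = calloc(1, sizeof(*c));	/* fall back to the allocator */
		if (!c)
			return NULL;
		c->requires_free = 1;
	} else {
		c->requires_free = 0;
	}
	return c;
}

static void release_ctx(struct ctx *c)
{
	if (c->requires_free) {
		free(c);		/* came from the allocator */
		return;
	}
	pthread_mutex_lock(&list_lock);	/* back onto the pool */
	c->next = free_list;
	free_list = c;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct ctx *c = get_ctx();

	if (c)
		release_ctx(c);
	return 0;
}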
128/**
129 * fscrypt_complete() - The completion callback for page encryption
130 * @req: The asynchronous encryption request context
131 * @res: The result of the encryption operation
132 */
133static void fscrypt_complete(struct crypto_async_request *req, int res)
134{
135 struct fscrypt_completion_result *ecr = req->data;
136
137 if (res == -EINPROGRESS)
138 return;
139 ecr->res = res;
140 complete(&ecr->completion);
141}
142
143typedef enum {
144 FS_DECRYPT = 0,
145 FS_ENCRYPT,
146} fscrypt_direction_t;
147
148static int do_page_crypto(struct inode *inode,
149 fscrypt_direction_t rw, pgoff_t index,
150 struct page *src_page, struct page *dest_page)
151{
152 u8 xts_tweak[FS_XTS_TWEAK_SIZE];
153 struct skcipher_request *req = NULL;
154 DECLARE_FS_COMPLETION_RESULT(ecr);
155 struct scatterlist dst, src;
156 struct fscrypt_info *ci = inode->i_crypt_info;
157 struct crypto_skcipher *tfm = ci->ci_ctfm;
158 int res = 0;
159
160 req = skcipher_request_alloc(tfm, GFP_NOFS);
161 if (!req) {
162 printk_ratelimited(KERN_ERR
163 "%s: crypto_request_alloc() failed\n",
164 __func__);
165 return -ENOMEM;
166 }
167
168 skcipher_request_set_callback(
169 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
170 fscrypt_complete, &ecr);
171
172 BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index));
173 memcpy(xts_tweak, &index, sizeof(index));
174 memset(&xts_tweak[sizeof(index)], 0,
175 FS_XTS_TWEAK_SIZE - sizeof(index));
176
177 sg_init_table(&dst, 1);
178 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
179 sg_init_table(&src, 1);
180 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
181 skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
182 xts_tweak);
183 if (rw == FS_DECRYPT)
184 res = crypto_skcipher_decrypt(req);
185 else
186 res = crypto_skcipher_encrypt(req);
187 if (res == -EINPROGRESS || res == -EBUSY) {
188 BUG_ON(req->base.data != &ecr);
189 wait_for_completion(&ecr.completion);
190 res = ecr.res;
191 }
192 skcipher_request_free(req);
193 if (res) {
194 printk_ratelimited(KERN_ERR
195 "%s: crypto_skcipher_encrypt() returned %d\n",
196 __func__, res);
197 return res;
198 }
199 return 0;
200}
201
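do_page_crypto() derives the per-page XTS tweak directly from the page index: the index fills the low bytes (in host endianness, little-endian on x86) and the rest is zeroed, so no two pages of a file share a tweak. A standalone illustration; FS_XTS_TWEAK_SIZE is assumed to be 16, the AES-XTS tweak width:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FS_XTS_TWEAK_SIZE 16	/* assumed 16-byte AES-XTS tweak */

int main(void)
{
	uint64_t index = 42;	/* page index within the file */
	uint8_t tweak[FS_XTS_TWEAK_SIZE];

	/* Same construction as do_page_crypto(): index in the low
	 * bytes, zero padding above it. */
	memcpy(tweak, &index, sizeof(index));
	memset(tweak + sizeof(index), 0, sizeof(tweak) - sizeof(index));

	for (size_t i = 0; i < sizeof(tweak); i++)
		printf("%02x", tweak[i]);
	printf("\n");
	return 0;
}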
202static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
203{
204 ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool,
205 GFP_NOWAIT);
206 if (ctx->w.bounce_page == NULL)
207 return ERR_PTR(-ENOMEM);
208 ctx->flags |= FS_WRITE_PATH_FL;
209 return ctx->w.bounce_page;
210}
211
212/**
213 * fscrypt_encrypt_page() - Encrypts a page
214 * @inode: The inode for which the encryption should take place
215 * @plaintext_page: The page to encrypt. Must be locked.
216 *
217 * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
218 * encryption context.
219 *
220 * Called on the page write path. The caller must call
221 * fscrypt_restore_control_page() on the returned ciphertext page to
222 * release the bounce buffer and the encryption context.
223 *
224 * Return: An allocated page with the encrypted content on success. Else, an
225 * error value or NULL.
226 */
227struct page *fscrypt_encrypt_page(struct inode *inode,
228 struct page *plaintext_page)
229{
230 struct fscrypt_ctx *ctx;
231 struct page *ciphertext_page = NULL;
232 int err;
233
234 BUG_ON(!PageLocked(plaintext_page));
235
236 ctx = fscrypt_get_ctx(inode);
237 if (IS_ERR(ctx))
238 return (struct page *)ctx;
239
240 /* The encryption operation will require a bounce page. */
241 ciphertext_page = alloc_bounce_page(ctx);
242 if (IS_ERR(ciphertext_page))
243 goto errout;
244
245 ctx->w.control_page = plaintext_page;
246 err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
247 plaintext_page, ciphertext_page);
248 if (err) {
249 ciphertext_page = ERR_PTR(err);
250 goto errout;
251 }
252 SetPagePrivate(ciphertext_page);
253 set_page_private(ciphertext_page, (unsigned long)ctx);
254 lock_page(ciphertext_page);
255 return ciphertext_page;
256
257errout:
258 fscrypt_release_ctx(ctx);
259 return ciphertext_page;
260}
261EXPORT_SYMBOL(fscrypt_encrypt_page);
262
263/**
264 * fscrypt_decrypt_page() - Decrypts a page in-place
265 * @page: The page to decrypt. Must be locked.
266 *
267 * Decrypts page in-place using the ctx encryption context.
268 *
269 * Called from the read completion callback.
270 *
271 * Return: Zero on success, non-zero otherwise.
272 */
273int fscrypt_decrypt_page(struct page *page)
274{
275 BUG_ON(!PageLocked(page));
276
277 return do_page_crypto(page->mapping->host,
278 FS_DECRYPT, page->index, page, page);
279}
280EXPORT_SYMBOL(fscrypt_decrypt_page);
281
282int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
283 sector_t pblk, unsigned int len)
284{
285 struct fscrypt_ctx *ctx;
286 struct page *ciphertext_page = NULL;
287 struct bio *bio;
288 int ret, err = 0;
289
290 BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
291
292 ctx = fscrypt_get_ctx(inode);
293 if (IS_ERR(ctx))
294 return PTR_ERR(ctx);
295
296 ciphertext_page = alloc_bounce_page(ctx);
297 if (IS_ERR(ciphertext_page)) {
298 err = PTR_ERR(ciphertext_page);
299 goto errout;
300 }
301
302 while (len--) {
303 err = do_page_crypto(inode, FS_ENCRYPT, lblk,
304 ZERO_PAGE(0), ciphertext_page);
305 if (err)
306 goto errout;
307
308 bio = bio_alloc(GFP_KERNEL, 1);
309 if (!bio) {
310 err = -ENOMEM;
311 goto errout;
312 }
313 bio->bi_bdev = inode->i_sb->s_bdev;
314 bio->bi_iter.bi_sector =
315 pblk << (inode->i_sb->s_blocksize_bits - 9);
316 ret = bio_add_page(bio, ciphertext_page,
317 inode->i_sb->s_blocksize, 0);
318 if (ret != inode->i_sb->s_blocksize) {
319 /* should never happen! */
320 WARN_ON(1);
321 bio_put(bio);
322 err = -EIO;
323 goto errout;
324 }
325 err = submit_bio_wait(WRITE, bio);
326 if ((err == 0) && bio->bi_error)
327 err = -EIO;
328 bio_put(bio);
329 if (err)
330 goto errout;
331 lblk++;
332 pblk++;
333 }
334 err = 0;
335errout:
336 fscrypt_release_ctx(ctx);
337 return err;
338}
339EXPORT_SYMBOL(fscrypt_zeroout_range);
340
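The bi_sector assignment above converts filesystem blocks to the bio layer's 512-byte sectors; shifting by (s_blocksize_bits - 9) is the same as multiplying by blocksize/512. Worked out in plain C:

#include <stdio.h>

int main(void)
{
	unsigned long long pblk = 12345;	/* filesystem block number */
	unsigned int blocksize_bits = 12;	/* 4096-byte blocks */

	/* bi_sector is in 512-byte units, hence the "- 9":
	 * pblk * (4096 / 512) == pblk << (12 - 9). */
	unsigned long long sector = pblk << (blocksize_bits - 9);

	printf("block %llu -> sector %llu\n", pblk, sector);
	return 0;
}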
341/*
342 * Validate dentries for encrypted directories to make sure we aren't
343 * potentially caching stale data after a key has been added or
344 * removed.
345 */
346static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
347{
348 struct inode *dir = d_inode(dentry->d_parent);
349 struct fscrypt_info *ci = dir->i_crypt_info;
350 int dir_has_key, cached_with_key;
351
352 if (!dir->i_sb->s_cop->is_encrypted(dir))
353 return 0;
354
355 if (ci && ci->ci_keyring_key &&
356 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
357 (1 << KEY_FLAG_REVOKED) |
358 (1 << KEY_FLAG_DEAD))))
359 ci = NULL;
360
361 /* this should eventually be a flag in d_flags */
362 spin_lock(&dentry->d_lock);
363 cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
364 spin_unlock(&dentry->d_lock);
365 dir_has_key = (ci != NULL);
366
367 /*
368 * If the dentry was cached without the key, and it is a
369 * negative dentry, it might be a valid name. We can't check
370 * if the key has since been made available due to locking
371 * reasons, so we fail the validation and let ext4_lookup() do
372 * this check.
373 *
374 * We also fail the validation if the dentry was created with
375 * the key present, but we no longer have the key, or vice versa.
376 */
377 if ((!cached_with_key && d_is_negative(dentry)) ||
378 (!cached_with_key && dir_has_key) ||
379 (cached_with_key && !dir_has_key))
380 return 0;
381 return 1;
382}
383
384const struct dentry_operations fscrypt_d_ops = {
385 .d_revalidate = fscrypt_d_revalidate,
386};
387EXPORT_SYMBOL(fscrypt_d_ops);
388
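The three conditions in fscrypt_d_revalidate() reduce to: invalidate negative dentries that were cached without the key, and invalidate whenever key presence has flipped since the dentry was cached. The same decision table as a pure function:

#include <stdbool.h>
#include <stdio.h>

/* Return 0 (revalidation fails) when the key situation may have
 * changed since the dentry was cached, 1 (still valid) otherwise. */
static int revalidate(bool cached_with_key, bool dir_has_key,
		      bool is_negative)
{
	if (!cached_with_key && is_negative)
		return 0;	/* name might decrypt validly now */
	if (cached_with_key != dir_has_key)
		return 0;	/* key appeared or disappeared */
	return 1;
}

int main(void)
{
	printf("%d %d %d\n",
	       revalidate(false, true, false),	/* key appeared: 0 */
	       revalidate(true, false, false),	/* key vanished: 0 */
	       revalidate(true, true, false));	/* unchanged:    1 */
	return 0;
}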
389/*
390 * Call fscrypt_decrypt_page on every single page, reusing the encryption
391 * context.
392 */
393static void completion_pages(struct work_struct *work)
394{
395 struct fscrypt_ctx *ctx =
396 container_of(work, struct fscrypt_ctx, r.work);
397 struct bio *bio = ctx->r.bio;
398 struct bio_vec *bv;
399 int i;
400
401 bio_for_each_segment_all(bv, bio, i) {
402 struct page *page = bv->bv_page;
403 int ret = fscrypt_decrypt_page(page);
404
405 if (ret) {
406 WARN_ON_ONCE(1);
407 SetPageError(page);
408 } else {
409 SetPageUptodate(page);
410 }
411 unlock_page(page);
412 }
413 fscrypt_release_ctx(ctx);
414 bio_put(bio);
415}
416
417void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
418{
419 INIT_WORK(&ctx->r.work, completion_pages);
420 ctx->r.bio = bio;
421 queue_work(fscrypt_read_workqueue, &ctx->r.work);
422}
423EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
424
425void fscrypt_pullback_bio_page(struct page **page, bool restore)
426{
427 struct fscrypt_ctx *ctx;
428 struct page *bounce_page;
429
430 /* The bounce data pages are unmapped. */
431 if ((*page)->mapping)
432 return;
433
434 /* The bounce data page is unmapped. */
435 bounce_page = *page;
436 ctx = (struct fscrypt_ctx *)page_private(bounce_page);
437
438 /* restore control page */
439 *page = ctx->w.control_page;
440
441 if (restore)
442 fscrypt_restore_control_page(bounce_page);
443}
444EXPORT_SYMBOL(fscrypt_pullback_bio_page);
445
446void fscrypt_restore_control_page(struct page *page)
447{
448 struct fscrypt_ctx *ctx;
449
450 ctx = (struct fscrypt_ctx *)page_private(page);
451 set_page_private(page, (unsigned long)NULL);
452 ClearPagePrivate(page);
453 unlock_page(page);
454 fscrypt_release_ctx(ctx);
455}
456EXPORT_SYMBOL(fscrypt_restore_control_page);
457
458static void fscrypt_destroy(void)
459{
460 struct fscrypt_ctx *pos, *n;
461
462 list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
463 kmem_cache_free(fscrypt_ctx_cachep, pos);
464 INIT_LIST_HEAD(&fscrypt_free_ctxs);
465 mempool_destroy(fscrypt_bounce_page_pool);
466 fscrypt_bounce_page_pool = NULL;
467}
468
469/**
470 * fscrypt_initialize() - allocate major buffers for fs encryption.
471 *
472 * We only call this when we start accessing encrypted files, since it
473 * results in memory getting allocated that wouldn't otherwise be used.
474 *
475 * Return: Zero on success, non-zero otherwise.
476 */
477int fscrypt_initialize(void)
478{
479 int i, res = -ENOMEM;
480
481 if (fscrypt_bounce_page_pool)
482 return 0;
483
484 mutex_lock(&fscrypt_init_mutex);
485 if (fscrypt_bounce_page_pool)
486 goto already_initialized;
487
488 for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
489 struct fscrypt_ctx *ctx;
490
491 ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
492 if (!ctx)
493 goto fail;
494 list_add(&ctx->free_list, &fscrypt_free_ctxs);
495 }
496
497 fscrypt_bounce_page_pool =
498 mempool_create_page_pool(num_prealloc_crypto_pages, 0);
499 if (!fscrypt_bounce_page_pool)
500 goto fail;
501
502already_initialized:
503 mutex_unlock(&fscrypt_init_mutex);
504 return 0;
505fail:
506 fscrypt_destroy();
507 mutex_unlock(&fscrypt_init_mutex);
508 return res;
509}
510EXPORT_SYMBOL(fscrypt_initialize);
511
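fscrypt_initialize() uses the check/lock/re-check idiom: an unlocked fast-path test of fscrypt_bounce_page_pool, then a second test under fscrypt_init_mutex so concurrent first users initialize exactly once. A userspace sketch with a pthread mutex (the unlocked flag read glosses over memory ordering, much as the kernel's unlocked pointer test does):

#include <pthread.h>
#include <stdbool.h>

static bool initialized;
static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;

static int do_expensive_setup(void) { return 0; }	/* stand-in */

static int lazy_init(void)
{
	int res;

	if (initialized)		/* fast path, no lock taken */
		return 0;

	pthread_mutex_lock(&init_mutex);
	if (initialized) {		/* re-check under the lock */
		pthread_mutex_unlock(&init_mutex);
		return 0;
	}
	res = do_expensive_setup();
	if (res == 0)
		initialized = true;
	pthread_mutex_unlock(&init_mutex);
	return res;
}

int main(void)
{
	return lazy_init();
}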
512/**
513 * fscrypt_init() - Set up for fs encryption.
514 */
515static int __init fscrypt_init(void)
516{
517 fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
518 WQ_HIGHPRI, 0);
519 if (!fscrypt_read_workqueue)
520 goto fail;
521
522 fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
523 if (!fscrypt_ctx_cachep)
524 goto fail_free_queue;
525
526 fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
527 if (!fscrypt_info_cachep)
528 goto fail_free_ctx;
529
530 return 0;
531
532fail_free_ctx:
533 kmem_cache_destroy(fscrypt_ctx_cachep);
534fail_free_queue:
535 destroy_workqueue(fscrypt_read_workqueue);
536fail:
537 return -ENOMEM;
538}
539module_init(fscrypt_init)
540
541/**
542 * fscrypt_exit() - Shutdown the fs encryption system
543 */
544static void __exit fscrypt_exit(void)
545{
546 fscrypt_destroy();
547
548 if (fscrypt_read_workqueue)
549 destroy_workqueue(fscrypt_read_workqueue);
550 kmem_cache_destroy(fscrypt_ctx_cachep);
551 kmem_cache_destroy(fscrypt_info_cachep);
552}
553module_exit(fscrypt_exit);
554
555MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c
index ab377d496a39..5d6d49113efa 100644
--- a/fs/f2fs/crypto_fname.c
+++ b/fs/crypto/fname.c
@@ -1,46 +1,32 @@
1/* 1/*
2 * linux/fs/f2fs/crypto_fname.c 2 * This contains functions for filename crypto management
3 *
4 * Copied from linux/fs/ext4/crypto.c
5 * 3 *
6 * Copyright (C) 2015, Google, Inc. 4 * Copyright (C) 2015, Google, Inc.
7 * Copyright (C) 2015, Motorola Mobility 5 * Copyright (C) 2015, Motorola Mobility
8 * 6 *
9 * This contains functions for filename crypto management in f2fs
10 *
11 * Written by Uday Savagaonkar, 2014. 7 * Written by Uday Savagaonkar, 2014.
12 * 8 * Modified by Jaegeuk Kim, 2015.
13 * Adjust f2fs dentry structure
14 * Jaegeuk Kim, 2015.
15 * 9 *
16 * This has not yet undergone a rigorous security audit. 10 * This has not yet undergone a rigorous security audit.
17 */ 11 */
18#include <crypto/hash.h> 12
19#include <crypto/sha.h>
20#include <keys/encrypted-type.h> 13#include <keys/encrypted-type.h>
21#include <keys/user-type.h> 14#include <keys/user-type.h>
22#include <linux/crypto.h>
23#include <linux/gfp.h>
24#include <linux/kernel.h>
25#include <linux/key.h>
26#include <linux/list.h>
27#include <linux/mempool.h>
28#include <linux/random.h>
29#include <linux/scatterlist.h> 15#include <linux/scatterlist.h>
30#include <linux/spinlock_types.h>
31#include <linux/f2fs_fs.h>
32#include <linux/ratelimit.h> 16#include <linux/ratelimit.h>
17#include <linux/fscrypto.h>
33 18
34#include "f2fs.h" 19static u32 size_round_up(size_t size, size_t blksize)
35#include "f2fs_crypto.h" 20{
36#include "xattr.h" 21 return ((size + blksize - 1) / blksize) * blksize;
22}
37 23
38/** 24/**
39 * f2fs_dir_crypt_complete() - 25 * dir_crypt_complete() -
40 */ 26 */
41static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) 27static void dir_crypt_complete(struct crypto_async_request *req, int res)
42{ 28{
43 struct f2fs_completion_result *ecr = req->data; 29 struct fscrypt_completion_result *ecr = req->data;
44 30
45 if (res == -EINPROGRESS) 31 if (res == -EINPROGRESS)
46 return; 32 return;
@@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
48 complete(&ecr->completion); 34 complete(&ecr->completion);
49} 35}
50 36
51bool f2fs_valid_filenames_enc_mode(uint32_t mode)
52{
53 return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
54}
55
56static unsigned max_name_len(struct inode *inode)
57{
58 return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
59 F2FS_NAME_LEN;
60}
61
62/** 37/**
63 * f2fs_fname_encrypt() - 38 * fname_encrypt() -
64 * 39 *
65 * This function encrypts the input filename, and returns the length of the 40 * This function encrypts the input filename, and returns the length of the
66 * ciphertext. Errors are returned as negative numbers. We trust the caller to 41 * ciphertext. Errors are returned as negative numbers. We trust the caller to
67 * allocate sufficient memory for the oname string. 42 * allocate sufficient memory for the oname string.
68 */ 43 */
69static int f2fs_fname_encrypt(struct inode *inode, 44static int fname_encrypt(struct inode *inode,
70 const struct qstr *iname, struct f2fs_str *oname) 45 const struct qstr *iname, struct fscrypt_str *oname)
71{ 46{
72 u32 ciphertext_len; 47 u32 ciphertext_len;
73 struct ablkcipher_request *req = NULL; 48 struct skcipher_request *req = NULL;
74 DECLARE_F2FS_COMPLETION_RESULT(ecr); 49 DECLARE_FS_COMPLETION_RESULT(ecr);
75 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; 50 struct fscrypt_info *ci = inode->i_crypt_info;
76 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 51 struct crypto_skcipher *tfm = ci->ci_ctfm;
77 int res = 0; 52 int res = 0;
78 char iv[F2FS_CRYPTO_BLOCK_SIZE]; 53 char iv[FS_CRYPTO_BLOCK_SIZE];
79 struct scatterlist src_sg, dst_sg; 54 struct scatterlist src_sg, dst_sg;
80 int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); 55 int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
81 char *workbuf, buf[32], *alloc_buf = NULL; 56 char *workbuf, buf[32], *alloc_buf = NULL;
82 unsigned lim = max_name_len(inode); 57 unsigned lim;
83 58
59 lim = inode->i_sb->s_cop->max_namelen(inode);
84 if (iname->len <= 0 || iname->len > lim) 60 if (iname->len <= 0 || iname->len > lim)
85 return -EIO; 61 return -EIO;
86 62
87 ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? 63 ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ?
88 F2FS_CRYPTO_BLOCK_SIZE : iname->len; 64 FS_CRYPTO_BLOCK_SIZE : iname->len;
89 ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); 65 ciphertext_len = size_round_up(ciphertext_len, padding);
90 ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; 66 ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
91 67
92 if (ciphertext_len <= sizeof(buf)) { 68 if (ciphertext_len <= sizeof(buf)) {
@@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode,
99 } 75 }
100 76
101 /* Allocate request */ 77 /* Allocate request */
102 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 78 req = skcipher_request_alloc(tfm, GFP_NOFS);
103 if (!req) { 79 if (!req) {
104 printk_ratelimited(KERN_ERR 80 printk_ratelimited(KERN_ERR
105 "%s: crypto_request_alloc() failed\n", __func__); 81 "%s: crypto_request_alloc() failed\n", __func__);
106 kfree(alloc_buf); 82 kfree(alloc_buf);
107 return -ENOMEM; 83 return -ENOMEM;
108 } 84 }
109 ablkcipher_request_set_callback(req, 85 skcipher_request_set_callback(req,
110 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 86 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
111 f2fs_dir_crypt_complete, &ecr); 87 dir_crypt_complete, &ecr);
112 88
113 /* Copy the input */ 89 /* Copy the input */
114 memcpy(workbuf, iname->name, iname->len); 90 memcpy(workbuf, iname->name, iname->len);
@@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode,
116 memset(workbuf + iname->len, 0, ciphertext_len - iname->len); 92 memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
117 93
118 /* Initialize IV */ 94 /* Initialize IV */
119 memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); 95 memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
120 96
121 /* Create encryption request */ 97 /* Create encryption request */
122 sg_init_one(&src_sg, workbuf, ciphertext_len); 98 sg_init_one(&src_sg, workbuf, ciphertext_len);
123 sg_init_one(&dst_sg, oname->name, ciphertext_len); 99 sg_init_one(&dst_sg, oname->name, ciphertext_len);
124 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); 100 skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
125 res = crypto_ablkcipher_encrypt(req); 101 res = crypto_skcipher_encrypt(req);
126 if (res == -EINPROGRESS || res == -EBUSY) { 102 if (res == -EINPROGRESS || res == -EBUSY) {
127 BUG_ON(req->base.data != &ecr);
128 wait_for_completion(&ecr.completion); 103 wait_for_completion(&ecr.completion);
129 res = ecr.res; 104 res = ecr.res;
130 } 105 }
131 kfree(alloc_buf); 106 kfree(alloc_buf);
132 ablkcipher_request_free(req); 107 skcipher_request_free(req);
133 if (res < 0) { 108 if (res < 0)
134 printk_ratelimited(KERN_ERR 109 printk_ratelimited(KERN_ERR
135 "%s: Error (error code %d)\n", __func__, res); 110 "%s: Error (error code %d)\n", __func__, res);
136 } 111
137 oname->len = ciphertext_len; 112 oname->len = ciphertext_len;
138 return res; 113 return res;
139} 114}
140 115
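fname_encrypt() sizes the ciphertext by padding the name up to the policy's pad quantum: padding = 4 << (ci_flags & FS_POLICY_FLAGS_PAD_MASK) yields 4, 8, 16 or 32 bytes, with a floor of FS_CRYPTO_BLOCK_SIZE and a ceiling of the filesystem's name limit. A standalone restatement; the two macro values are assumptions matching the usual fscrypt definitions:

#include <stdio.h>

#define FS_CRYPTO_BLOCK_SIZE 16		/* assumed */
#define FS_POLICY_FLAGS_PAD_MASK 0x03	/* assumed */

static unsigned size_round_up(unsigned size, unsigned blksize)
{
	return ((size + blksize - 1) / blksize) * blksize;
}

int main(void)
{
	unsigned flags = 2;	/* policy pad flag: 4 << 2 = 16-byte pad */
	unsigned padding = 4 << (flags & FS_POLICY_FLAGS_PAD_MASK);
	unsigned name_len = 21, lim = 255;

	unsigned len = name_len < FS_CRYPTO_BLOCK_SIZE ?
		       FS_CRYPTO_BLOCK_SIZE : name_len;
	len = size_round_up(len, padding);	/* pad to the quantum */
	if (len > lim)
		len = lim;			/* cap at the name limit */

	printf("%u-byte name -> %u-byte ciphertext\n", name_len, len);
	return 0;
}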
141/* 116/*
142 * f2fs_fname_decrypt() 117 * fname_decrypt()
143 * This function decrypts the input filename, and returns 118 * This function decrypts the input filename, and returns
144 * the length of the plaintext. 119 * the length of the plaintext.
145 * Errors are returned as negative numbers. 120 * Errors are returned as negative numbers.
146 * We trust the caller to allocate sufficient memory for the oname string. 121 * We trust the caller to allocate sufficient memory for the oname string.
147 */ 122 */
148static int f2fs_fname_decrypt(struct inode *inode, 123static int fname_decrypt(struct inode *inode,
149 const struct f2fs_str *iname, struct f2fs_str *oname) 124 const struct fscrypt_str *iname,
125 struct fscrypt_str *oname)
150{ 126{
151 struct ablkcipher_request *req = NULL; 127 struct skcipher_request *req = NULL;
152 DECLARE_F2FS_COMPLETION_RESULT(ecr); 128 DECLARE_FS_COMPLETION_RESULT(ecr);
153 struct scatterlist src_sg, dst_sg; 129 struct scatterlist src_sg, dst_sg;
154 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; 130 struct fscrypt_info *ci = inode->i_crypt_info;
155 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 131 struct crypto_skcipher *tfm = ci->ci_ctfm;
156 int res = 0; 132 int res = 0;
157 char iv[F2FS_CRYPTO_BLOCK_SIZE]; 133 char iv[FS_CRYPTO_BLOCK_SIZE];
158 unsigned lim = max_name_len(inode); 134 unsigned lim;
159 135
136 lim = inode->i_sb->s_cop->max_namelen(inode);
160 if (iname->len <= 0 || iname->len > lim) 137 if (iname->len <= 0 || iname->len > lim)
161 return -EIO; 138 return -EIO;
162 139
163 /* Allocate request */ 140 /* Allocate request */
164 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 141 req = skcipher_request_alloc(tfm, GFP_NOFS);
165 if (!req) { 142 if (!req) {
166 printk_ratelimited(KERN_ERR 143 printk_ratelimited(KERN_ERR
167 "%s: crypto_request_alloc() failed\n", __func__); 144 "%s: crypto_request_alloc() failed\n", __func__);
168 return -ENOMEM; 145 return -ENOMEM;
169 } 146 }
170 ablkcipher_request_set_callback(req, 147 skcipher_request_set_callback(req,
171 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 148 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
172 f2fs_dir_crypt_complete, &ecr); 149 dir_crypt_complete, &ecr);
173 150
174 /* Initialize IV */ 151 /* Initialize IV */
175 memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); 152 memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
176 153
177 /* Create decryption request */ 154 /* Create decryption request */
178 sg_init_one(&src_sg, iname->name, iname->len); 155 sg_init_one(&src_sg, iname->name, iname->len);
179 sg_init_one(&dst_sg, oname->name, oname->len); 156 sg_init_one(&dst_sg, oname->name, oname->len);
180 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); 157 skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
181 res = crypto_ablkcipher_decrypt(req); 158 res = crypto_skcipher_decrypt(req);
182 if (res == -EINPROGRESS || res == -EBUSY) { 159 if (res == -EINPROGRESS || res == -EBUSY) {
183 BUG_ON(req->base.data != &ecr);
184 wait_for_completion(&ecr.completion); 160 wait_for_completion(&ecr.completion);
185 res = ecr.res; 161 res = ecr.res;
186 } 162 }
187 ablkcipher_request_free(req); 163 skcipher_request_free(req);
188 if (res < 0) { 164 if (res < 0) {
189 printk_ratelimited(KERN_ERR 165 printk_ratelimited(KERN_ERR
190 "%s: Error in f2fs_fname_decrypt (error code %d)\n", 166 "%s: Error (error code %d)\n", __func__, res);
191 __func__, res);
192 return res; 167 return res;
193 } 168 }
194 169
@@ -200,7 +175,7 @@ static const char *lookup_table =
200 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; 175 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
201 176
202/** 177/**
203 * f2fs_fname_encode_digest() - 178 * digest_encode() -
204 * 179 *
205 * Encodes the input digest using characters from the set [A-Za-z0-9+,]. 180 * Encodes the input digest using characters from the set [A-Za-z0-9+,].
206 * The encoded string is roughly 4/3 times the size of the input string. 181 * The encoded string is roughly 4/3 times the size of the input string.
@@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst)
249 return cp - dst; 224 return cp - dst;
250} 225}
251 226
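digest_encode()'s body is not shown in this hunk; the sketch below is a plausible reconstruction of a base64-style packer over the lookup_table shown above, emitting one table character per 6 input bits (hence the roughly 4/3 expansion). Only the alphabet and the expansion ratio come from the patch; the exact bit order is an assumption:

#include <stdio.h>

static const char *lookup_table =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

static int digest_encode(const char *src, int len, char *dst)
{
	int i = 0, bits = 0, ac = 0;	/* ac accumulates input bits */
	char *cp = dst;

	while (i < len) {
		ac += ((unsigned char)src[i]) << bits;
		bits += 8;
		do {			/* drain 6 bits per character */
			*cp++ = lookup_table[ac & 0x3f];
			ac >>= 6;
			bits -= 6;
		} while (bits >= 6);
		i++;
	}
	if (bits)			/* flush the partial tail */
		*cp++ = lookup_table[ac & 0x3f];
	return cp - dst;
}

int main(void)
{
	char out[64];
	int n = digest_encode("\x01\x02\x03", 3, out);

	printf("%.*s\n", n, out);	/* 3 bytes -> 4 characters */
	return 0;
}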
252/** 227u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
253 * f2fs_fname_crypto_round_up() -
254 *
255 * Return: The next multiple of block size
256 */
257u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
258{ 228{
259 return ((size + blksize - 1) / blksize) * blksize; 229 int padding = 32;
230 struct fscrypt_info *ci = inode->i_crypt_info;
231
232 if (ci)
233 padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
234 if (ilen < FS_CRYPTO_BLOCK_SIZE)
235 ilen = FS_CRYPTO_BLOCK_SIZE;
236 return size_round_up(ilen, padding);
260} 237}
238EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
261 239
262/** 240/**
263 * f2fs_fname_crypto_alloc_obuff() - 241 * fscrypt_fname_alloc_buffer() -
264 * 242 *
265 * Allocates an output buffer that is sufficient for the crypto operation 243 * Allocates an output buffer that is sufficient for the crypto operation
266 * specified by the context and the direction. 244 * specified by the context and the direction.
267 */ 245 */
268int f2fs_fname_crypto_alloc_buffer(struct inode *inode, 246int fscrypt_fname_alloc_buffer(struct inode *inode,
269 u32 ilen, struct f2fs_str *crypto_str) 247 u32 ilen, struct fscrypt_str *crypto_str)
270{ 248{
271 unsigned int olen; 249 unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
272 int padding = 16;
273 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
274 250
275 if (ci)
276 padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
277 if (padding < F2FS_CRYPTO_BLOCK_SIZE)
278 padding = F2FS_CRYPTO_BLOCK_SIZE;
279 olen = f2fs_fname_crypto_round_up(ilen, padding);
280 crypto_str->len = olen; 251 crypto_str->len = olen;
281 if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) 252 if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
282 olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; 253 olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
283 /* Allocated buffer can hold one more character to null-terminate the 254 /*
284 * string */ 255 * Allocated buffer can hold one more character to null-terminate the
256 * string
257 */
285 crypto_str->name = kmalloc(olen + 1, GFP_NOFS); 258 crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
286 if (!(crypto_str->name)) 259 if (!(crypto_str->name))
287 return -ENOMEM; 260 return -ENOMEM;
288 return 0; 261 return 0;
289} 262}
263EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
290 264
291/** 265/**
292 * f2fs_fname_crypto_free_buffer() - 266 * fscrypt_fname_free_buffer() -
293 * 267 *
294 * Frees the buffer allocated for crypto operation. 268 * Frees the buffer allocated for crypto operation.
295 */ 269 */
296void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) 270void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
297{ 271{
298 if (!crypto_str) 272 if (!crypto_str)
299 return; 273 return;
300 kfree(crypto_str->name); 274 kfree(crypto_str->name);
301 crypto_str->name = NULL; 275 crypto_str->name = NULL;
302} 276}
277EXPORT_SYMBOL(fscrypt_fname_free_buffer);
303 278
304/** 279/**
305 * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space 280 * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
281 * space
306 */ 282 */
307int f2fs_fname_disk_to_usr(struct inode *inode, 283int fscrypt_fname_disk_to_usr(struct inode *inode,
308 f2fs_hash_t *hash, 284 u32 hash, u32 minor_hash,
309 const struct f2fs_str *iname, 285 const struct fscrypt_str *iname,
310 struct f2fs_str *oname) 286 struct fscrypt_str *oname)
311{ 287{
312 const struct qstr qname = FSTR_TO_QSTR(iname); 288 const struct qstr qname = FSTR_TO_QSTR(iname);
313 char buf[24]; 289 char buf[24];
314 int ret; 290 int ret;
315 291
316 if (is_dot_dotdot(&qname)) { 292 if (fscrypt_is_dot_dotdot(&qname)) {
317 oname->name[0] = '.'; 293 oname->name[0] = '.';
318 oname->name[iname->len - 1] = '.'; 294 oname->name[iname->len - 1] = '.';
319 oname->len = iname->len; 295 oname->len = iname->len;
320 return oname->len; 296 return oname->len;
321 } 297 }
322 298
323 if (F2FS_I(inode)->i_crypt_info) 299 if (iname->len < FS_CRYPTO_BLOCK_SIZE)
324 return f2fs_fname_decrypt(inode, iname, oname); 300 return -EUCLEAN;
325 301
326 if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { 302 if (inode->i_crypt_info)
303 return fname_decrypt(inode, iname, oname);
304
305 if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
327 ret = digest_encode(iname->name, iname->len, oname->name); 306 ret = digest_encode(iname->name, iname->len, oname->name);
328 oname->len = ret; 307 oname->len = ret;
329 return ret; 308 return ret;
330 } 309 }
331 if (hash) { 310 if (hash) {
332 memcpy(buf, hash, 4); 311 memcpy(buf, &hash, 4);
333 memset(buf + 4, 0, 4); 312 memcpy(buf + 4, &minor_hash, 4);
334 } else 313 } else {
335 memset(buf, 0, 8); 314 memset(buf, 0, 8);
315 }
336 memcpy(buf + 8, iname->name + iname->len - 16, 16); 316 memcpy(buf + 8, iname->name + iname->len - 16, 16);
337 oname->name[0] = '_'; 317 oname->name[0] = '_';
338 ret = digest_encode(buf, 24, oname->name + 1); 318 ret = digest_encode(buf, 24, oname->name + 1);
339 oname->len = ret + 1; 319 oname->len = ret + 1;
340 return ret + 1; 320 return ret + 1;
341} 321}
322EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
342 323
343/** 324/**
344 * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space 325 * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
326 * space
345 */ 327 */
346int f2fs_fname_usr_to_disk(struct inode *inode, 328int fscrypt_fname_usr_to_disk(struct inode *inode,
347 const struct qstr *iname, 329 const struct qstr *iname,
348 struct f2fs_str *oname) 330 struct fscrypt_str *oname)
349{ 331{
350 int res; 332 if (fscrypt_is_dot_dotdot(iname)) {
351 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
352
353 if (is_dot_dotdot(iname)) {
354 oname->name[0] = '.'; 333 oname->name[0] = '.';
355 oname->name[iname->len - 1] = '.'; 334 oname->name[iname->len - 1] = '.';
356 oname->len = iname->len; 335 oname->len = iname->len;
357 return oname->len; 336 return oname->len;
358 } 337 }
359 338 if (inode->i_crypt_info)
360 if (ci) { 339 return fname_encrypt(inode, iname, oname);
361 res = f2fs_fname_encrypt(inode, iname, oname); 340 /*
362 return res; 341 * Without a proper key, a user is not allowed to modify the filenames
363 }
364 /* Without a proper key, a user is not allowed to modify the filenames
365 * in a directory. Consequently, a user space name cannot be mapped to 342 * in a directory. Consequently, a user space name cannot be mapped to
366 * a disk-space name */ 343 * a disk-space name
344 */
367 return -EACCES; 345 return -EACCES;
368} 346}
347EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
369 348
370int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, 349int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
371 int lookup, struct f2fs_filename *fname) 350 int lookup, struct fscrypt_name *fname)
372{ 351{
373 struct f2fs_crypt_info *ci;
374 int ret = 0, bigname = 0; 352 int ret = 0, bigname = 0;
375 353
376 memset(fname, 0, sizeof(struct f2fs_filename)); 354 memset(fname, 0, sizeof(struct fscrypt_name));
377 fname->usr_fname = iname; 355 fname->usr_fname = iname;
378 356
379 if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { 357 if (!dir->i_sb->s_cop->is_encrypted(dir) ||
358 fscrypt_is_dot_dotdot(iname)) {
380 fname->disk_name.name = (unsigned char *)iname->name; 359 fname->disk_name.name = (unsigned char *)iname->name;
381 fname->disk_name.len = iname->len; 360 fname->disk_name.len = iname->len;
382 return 0; 361 return 0;
383 } 362 }
384 ret = f2fs_get_encryption_info(dir); 363 ret = get_crypt_info(dir);
385 if (ret) 364 if (ret && ret != -EOPNOTSUPP)
386 return ret; 365 return ret;
387 ci = F2FS_I(dir)->i_crypt_info; 366
388 if (ci) { 367 if (dir->i_crypt_info) {
389 ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, 368 ret = fscrypt_fname_alloc_buffer(dir, iname->len,
390 &fname->crypto_buf); 369 &fname->crypto_buf);
391 if (ret < 0) 370 if (ret < 0)
392 return ret; 371 return ret;
393 ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); 372 ret = fname_encrypt(dir, iname, &fname->crypto_buf);
394 if (ret < 0) 373 if (ret < 0)
395 goto errout; 374 goto errout;
396 fname->disk_name.name = fname->crypto_buf.name; 375 fname->disk_name.name = fname->crypto_buf.name;
@@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
400 if (!lookup) 379 if (!lookup)
401 return -EACCES; 380 return -EACCES;
402 381
403 /* We don't have the key and we are doing a lookup; decode the 382 /*
383 * We don't have the key and we are doing a lookup; decode the
404 * user-supplied name 384 * user-supplied name
405 */ 385 */
406 if (iname->name[0] == '_') 386 if (iname->name[0] == '_')
407 bigname = 1; 387 bigname = 1;
408 if ((bigname && (iname->len != 33)) || 388 if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
409 (!bigname && (iname->len > 43)))
410 return -ENOENT; 389 return -ENOENT;
411 390
412 fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); 391 fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
413 if (fname->crypto_buf.name == NULL) 392 if (fname->crypto_buf.name == NULL)
414 return -ENOMEM; 393 return -ENOMEM;
394
415 ret = digest_decode(iname->name + bigname, iname->len - bigname, 395 ret = digest_decode(iname->name + bigname, iname->len - bigname,
416 fname->crypto_buf.name); 396 fname->crypto_buf.name);
417 if (ret < 0) { 397 if (ret < 0) {
@@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
421 fname->crypto_buf.len = ret; 401 fname->crypto_buf.len = ret;
422 if (bigname) { 402 if (bigname) {
423 memcpy(&fname->hash, fname->crypto_buf.name, 4); 403 memcpy(&fname->hash, fname->crypto_buf.name, 4);
404 memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
424 } else { 405 } else {
425 fname->disk_name.name = fname->crypto_buf.name; 406 fname->disk_name.name = fname->crypto_buf.name;
426 fname->disk_name.len = fname->crypto_buf.len; 407 fname->disk_name.len = fname->crypto_buf.len;
427 } 408 }
428 return 0; 409 return 0;
410
429errout: 411errout:
430 f2fs_fname_crypto_free_buffer(&fname->crypto_buf); 412 fscrypt_fname_free_buffer(&fname->crypto_buf);
431 return ret; 413 return ret;
432} 414}
415EXPORT_SYMBOL(fscrypt_setup_filename);
433 416
434void f2fs_fname_free_filename(struct f2fs_filename *fname) 417void fscrypt_free_filename(struct fscrypt_name *fname)
435{ 418{
436 kfree(fname->crypto_buf.name); 419 kfree(fname->crypto_buf.name);
437 fname->crypto_buf.name = NULL; 420 fname->crypto_buf.name = NULL;
438 fname->usr_fname = NULL; 421 fname->usr_fname = NULL;
439 fname->disk_name.name = NULL; 422 fname->disk_name.name = NULL;
440} 423}
424EXPORT_SYMBOL(fscrypt_free_filename);
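The 33/43 length checks in fscrypt_setup_filename() fall straight out of the encoding: a 6-bits-per-character scheme turns the 24-byte hash blob into 32 characters (33 with the leading '_'), and a digest of up to 32 bytes into at most 43 characters. A quick arithmetic check:

#include <stdio.h>

/* One output character per 6 input bits, rounded up. */
static int encoded_len(int bytes)
{
	return (bytes * 8 + 5) / 6;
}

int main(void)
{
	printf("24-byte hash blob -> %d chars (+1 for '_')\n",
	       encoded_len(24));			/* 32, total 33 */
	printf("32-byte digest    -> %d chars\n",
	       encoded_len(32));			/* 43 */
	return 0;
}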
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
new file mode 100644
index 000000000000..06f5aa478bf2
--- /dev/null
+++ b/fs/crypto/keyinfo.c
@@ -0,0 +1,272 @@
1/*
2 * key management facility for FS encryption support.
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption key functions.
7 *
8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
9 */
10
11#include <keys/encrypted-type.h>
12#include <keys/user-type.h>
13#include <linux/random.h>
14#include <linux/scatterlist.h>
15#include <uapi/linux/keyctl.h>
16#include <linux/fscrypto.h>
17
18static void derive_crypt_complete(struct crypto_async_request *req, int rc)
19{
20 struct fscrypt_completion_result *ecr = req->data;
21
22 if (rc == -EINPROGRESS)
23 return;
24
25 ecr->res = rc;
26 complete(&ecr->completion);
27}
28
29/**
30 * derive_key_aes() - Derive a key using AES-128-ECB
31 * @deriving_key: Encryption key used for derivation.
32 * @source_key: Source key to which to apply derivation.
33 * @derived_key: Derived key.
34 *
35 * Return: Zero on success; non-zero otherwise.
36 */
37static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
38 u8 source_key[FS_AES_256_XTS_KEY_SIZE],
39 u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
40{
41 int res = 0;
42 struct skcipher_request *req = NULL;
43 DECLARE_FS_COMPLETION_RESULT(ecr);
44 struct scatterlist src_sg, dst_sg;
45 struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
46
47 if (IS_ERR(tfm)) {
48 res = PTR_ERR(tfm);
49 tfm = NULL;
50 goto out;
51 }
52 crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
53 req = skcipher_request_alloc(tfm, GFP_NOFS);
54 if (!req) {
55 res = -ENOMEM;
56 goto out;
57 }
58 skcipher_request_set_callback(req,
59 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
60 derive_crypt_complete, &ecr);
61 res = crypto_skcipher_setkey(tfm, deriving_key,
62 FS_AES_128_ECB_KEY_SIZE);
63 if (res < 0)
64 goto out;
65
66 sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
67 sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
68 skcipher_request_set_crypt(req, &src_sg, &dst_sg,
69 FS_AES_256_XTS_KEY_SIZE, NULL);
70 res = crypto_skcipher_encrypt(req);
71 if (res == -EINPROGRESS || res == -EBUSY) {
72 wait_for_completion(&ecr.completion);
73 res = ecr.res;
74 }
75out:
76 skcipher_request_free(req);
77 crypto_free_skcipher(tfm);
78 return res;
79}
80
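derive_key_aes() produces the per-inode key by encrypting the 512-bit master key under AES-128-ECB, keyed with the inode's 16-byte nonce. The same derivation restated in userspace with OpenSSL's EVP interface (OpenSSL is an assumption of this sketch; build with -lcrypto):

#include <openssl/evp.h>
#include <stdio.h>

#define NONCE_SIZE 16	/* FS_AES_128_ECB_KEY_SIZE == nonce size */
#define XTS_KEY_SIZE 64	/* FS_AES_256_XTS_KEY_SIZE */

static int derive_key_aes(const unsigned char nonce[NONCE_SIZE],
			  const unsigned char master[XTS_KEY_SIZE],
			  unsigned char derived[XTS_KEY_SIZE])
{
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	int outl = 0, tmpl = 0, ok = 0;

	if (!ctx)
		return -1;
	/* ECB keyed with the nonce; padding off, input is a multiple
	 * of the AES block size. */
	if (EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL) &&
	    EVP_CIPHER_CTX_set_padding(ctx, 0) &&
	    EVP_EncryptUpdate(ctx, derived, &outl, master, XTS_KEY_SIZE) &&
	    EVP_EncryptFinal_ex(ctx, derived + outl, &tmpl))
		ok = 1;
	EVP_CIPHER_CTX_free(ctx);
	return ok ? 0 : -1;
}

int main(void)
{
	unsigned char nonce[NONCE_SIZE] = { 0 };
	unsigned char master[XTS_KEY_SIZE] = { 0 };
	unsigned char derived[XTS_KEY_SIZE];

	if (derive_key_aes(nonce, master, derived))
		return 1;
	for (int i = 0; i < 8; i++)
		printf("%02x", derived[i]);
	printf("...\n");
	return 0;
}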
81static void put_crypt_info(struct fscrypt_info *ci)
82{
83 if (!ci)
84 return;
85
86 key_put(ci->ci_keyring_key);
87 crypto_free_skcipher(ci->ci_ctfm);
88 kmem_cache_free(fscrypt_info_cachep, ci);
89}
90
91int get_crypt_info(struct inode *inode)
92{
93 struct fscrypt_info *crypt_info;
94 u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
95 (FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
96 struct key *keyring_key = NULL;
97 struct fscrypt_key *master_key;
98 struct fscrypt_context ctx;
99 const struct user_key_payload *ukp;
100 struct crypto_skcipher *ctfm;
101 const char *cipher_str;
102 u8 raw_key[FS_MAX_KEY_SIZE];
103 u8 mode;
104 int res;
105
106 res = fscrypt_initialize();
107 if (res)
108 return res;
109
110 if (!inode->i_sb->s_cop->get_context)
111 return -EOPNOTSUPP;
112retry:
113 crypt_info = ACCESS_ONCE(inode->i_crypt_info);
114 if (crypt_info) {
115 if (!crypt_info->ci_keyring_key ||
116 key_validate(crypt_info->ci_keyring_key) == 0)
117 return 0;
118 fscrypt_put_encryption_info(inode, crypt_info);
119 goto retry;
120 }
121
122 res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
123 if (res < 0) {
124 if (!fscrypt_dummy_context_enabled(inode))
125 return res;
126 ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
127 ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
128 ctx.flags = 0;
129 } else if (res != sizeof(ctx)) {
130 return -EINVAL;
131 }
132 res = 0;
133
134 crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
135 if (!crypt_info)
136 return -ENOMEM;
137
138 crypt_info->ci_flags = ctx.flags;
139 crypt_info->ci_data_mode = ctx.contents_encryption_mode;
140 crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
141 crypt_info->ci_ctfm = NULL;
142 crypt_info->ci_keyring_key = NULL;
143 memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
144 sizeof(crypt_info->ci_master_key));
145 if (S_ISREG(inode->i_mode))
146 mode = crypt_info->ci_data_mode;
147 else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
148 mode = crypt_info->ci_filename_mode;
149 else
150 BUG();
151
152 switch (mode) {
153 case FS_ENCRYPTION_MODE_AES_256_XTS:
154 cipher_str = "xts(aes)";
155 break;
156 case FS_ENCRYPTION_MODE_AES_256_CTS:
157 cipher_str = "cts(cbc(aes))";
158 break;
159 default:
160 printk_once(KERN_WARNING
161 "%s: unsupported key mode %d (ino %u)\n",
162 __func__, mode, (unsigned) inode->i_ino);
163 res = -ENOKEY;
164 goto out;
165 }
166 if (fscrypt_dummy_context_enabled(inode)) {
167 memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
168 goto got_key;
169 }
170 memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
171 FS_KEY_DESC_PREFIX_SIZE);
172 sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
173 "%*phN", FS_KEY_DESCRIPTOR_SIZE,
174 ctx.master_key_descriptor);
175 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
176 (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
177 keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
178 if (IS_ERR(keyring_key)) {
179 res = PTR_ERR(keyring_key);
180 keyring_key = NULL;
181 goto out;
182 }
183 crypt_info->ci_keyring_key = keyring_key;
184 if (keyring_key->type != &key_type_logon) {
185 printk_once(KERN_WARNING
186 "%s: key type must be logon\n", __func__);
187 res = -ENOKEY;
188 goto out;
189 }
190 down_read(&keyring_key->sem);
191 ukp = user_key_payload(keyring_key);
192 if (ukp->datalen != sizeof(struct fscrypt_key)) {
193 res = -EINVAL;
194 up_read(&keyring_key->sem);
195 goto out;
196 }
197 master_key = (struct fscrypt_key *)ukp->data;
198 BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
199
200 if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
201 printk_once(KERN_WARNING
202 "%s: key size incorrect: %d\n",
203 __func__, master_key->size);
204 res = -ENOKEY;
205 up_read(&keyring_key->sem);
206 goto out;
207 }
208 res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
209 up_read(&keyring_key->sem);
210 if (res)
211 goto out;
212got_key:
213 ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
214 if (!ctfm || IS_ERR(ctfm)) {
215 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
216 printk(KERN_DEBUG
217 "%s: error %d (inode %u) allocating crypto tfm\n",
218 __func__, res, (unsigned) inode->i_ino);
219 goto out;
220 }
221 crypt_info->ci_ctfm = ctfm;
222 crypto_skcipher_clear_flags(ctfm, ~0);
223 crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
224 res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode));
225 if (res)
226 goto out;
227
228 memzero_explicit(raw_key, sizeof(raw_key));
229 if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
230 put_crypt_info(crypt_info);
231 goto retry;
232 }
233 return 0;
234
235out:
236 if (res == -ENOKEY)
237 res = 0;
238 put_crypt_info(crypt_info);
239 memzero_explicit(raw_key, sizeof(raw_key));
240 return res;
241}
242
243void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
244{
245 struct fscrypt_info *prev;
246
247 if (ci == NULL)
248 ci = ACCESS_ONCE(inode->i_crypt_info);
249 if (ci == NULL)
250 return;
251
252 prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
253 if (prev != ci)
254 return;
255
256 put_crypt_info(ci);
257}
258EXPORT_SYMBOL(fscrypt_put_encryption_info);
259
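i_crypt_info is published and torn down locklessly with cmpxchg(): the loser of a setup race frees its duplicate, and only the thread that swings the pointer to NULL frees the object. The same protocol sketched with C11 atomics:

#include <stdatomic.h>
#include <stdlib.h>

struct crypt_info { int dummy; };

static _Atomic(struct crypt_info *) i_crypt_info;

static int publish(struct crypt_info *ci)
{
	struct crypt_info *expected = NULL;

	/* Mirrors cmpxchg(&inode->i_crypt_info, NULL, crypt_info). */
	if (!atomic_compare_exchange_strong(&i_crypt_info, &expected, ci)) {
		free(ci);	/* somebody else won the race */
		return 0;
	}
	return 1;
}

static void put(void)
{
	struct crypt_info *ci = atomic_load(&i_crypt_info);

	/* Only the thread whose swap to NULL succeeds may free. */
	if (ci && atomic_compare_exchange_strong(&i_crypt_info, &ci, NULL))
		free(ci);
}

int main(void)
{
	struct crypt_info *ci = calloc(1, sizeof(*ci));

	if (ci)
		publish(ci);	/* frees ci itself if it loses */
	put();
	return 0;
}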
260int fscrypt_get_encryption_info(struct inode *inode)
261{
262 struct fscrypt_info *ci = inode->i_crypt_info;
263
264 if (!ci ||
265 (ci->ci_keyring_key &&
266 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
267 (1 << KEY_FLAG_REVOKED) |
268 (1 << KEY_FLAG_DEAD)))))
269 return get_crypt_info(inode);
270 return 0;
271}
272EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
new file mode 100644
index 000000000000..0f9961eede1e
--- /dev/null
+++ b/fs/crypto/policy.c
@@ -0,0 +1,229 @@
1/*
2 * Encryption policy functions for per-file encryption support.
3 *
4 * Copyright (C) 2015, Google, Inc.
5 * Copyright (C) 2015, Motorola Mobility.
6 *
7 * Written by Michael Halcrow, 2015.
8 * Modified by Jaegeuk Kim, 2015.
9 */
10
11#include <linux/random.h>
12#include <linux/string.h>
13#include <linux/fscrypto.h>
14
15static int inode_has_encryption_context(struct inode *inode)
16{
17 if (!inode->i_sb->s_cop->get_context)
18 return 0;
19 return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
20}
21
22/*
23 * check whether the policy is consistent with the encryption context
24 * for the inode
25 */
26static int is_encryption_context_consistent_with_policy(struct inode *inode,
27 const struct fscrypt_policy *policy)
28{
29 struct fscrypt_context ctx;
30 int res;
31
32 if (!inode->i_sb->s_cop->get_context)
33 return 0;
34
35 res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
36 if (res != sizeof(ctx))
37 return 0;
38
39 return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
40 FS_KEY_DESCRIPTOR_SIZE) == 0 &&
41 (ctx.flags == policy->flags) &&
42 (ctx.contents_encryption_mode ==
43 policy->contents_encryption_mode) &&
44 (ctx.filenames_encryption_mode ==
45 policy->filenames_encryption_mode));
46}
47
48static int create_encryption_context_from_policy(struct inode *inode,
49 const struct fscrypt_policy *policy)
50{
51 struct fscrypt_context ctx;
52 int res;
53
54 if (!inode->i_sb->s_cop->set_context)
55 return -EOPNOTSUPP;
56
57 if (inode->i_sb->s_cop->prepare_context) {
58 res = inode->i_sb->s_cop->prepare_context(inode);
59 if (res)
60 return res;
61 }
62
63 ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
64 memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
65 FS_KEY_DESCRIPTOR_SIZE);
66
67 if (!fscrypt_valid_contents_enc_mode(
68 policy->contents_encryption_mode)) {
69 printk(KERN_WARNING
70 "%s: Invalid contents encryption mode %d\n", __func__,
71 policy->contents_encryption_mode);
72 return -EINVAL;
73 }
74
75 if (!fscrypt_valid_filenames_enc_mode(
76 policy->filenames_encryption_mode)) {
77 printk(KERN_WARNING
78 "%s: Invalid filenames encryption mode %d\n", __func__,
79 policy->filenames_encryption_mode);
80 return -EINVAL;
81 }
82
83 if (policy->flags & ~FS_POLICY_FLAGS_VALID)
84 return -EINVAL;
85
86 ctx.contents_encryption_mode = policy->contents_encryption_mode;
87 ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
88 ctx.flags = policy->flags;
89 BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
90 get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
91
92 return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
93}
94
95int fscrypt_process_policy(struct inode *inode,
96 const struct fscrypt_policy *policy)
97{
98 if (policy->version != 0)
99 return -EINVAL;
100
101 if (!inode_has_encryption_context(inode)) {
102 if (!inode->i_sb->s_cop->empty_dir)
103 return -EOPNOTSUPP;
104 if (!inode->i_sb->s_cop->empty_dir(inode))
105 return -ENOTEMPTY;
106 return create_encryption_context_from_policy(inode, policy);
107 }
108
109 if (is_encryption_context_consistent_with_policy(inode, policy))
110 return 0;
111
112 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
113 __func__);
114 return -EINVAL;
115}
116EXPORT_SYMBOL(fscrypt_process_policy);
117
118int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
119{
120 struct fscrypt_context ctx;
121 int res;
122
123 if (!inode->i_sb->s_cop->get_context ||
124 !inode->i_sb->s_cop->is_encrypted(inode))
125 return -ENODATA;
126
127 res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
128 if (res != sizeof(ctx))
129 return -ENODATA;
130 if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
131 return -EINVAL;
132
133 policy->version = 0;
134 policy->contents_encryption_mode = ctx.contents_encryption_mode;
135 policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
136 policy->flags = ctx.flags;
137 memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
138 FS_KEY_DESCRIPTOR_SIZE);
139 return 0;
140}
141EXPORT_SYMBOL(fscrypt_get_policy);
142
143int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
144{
145 struct fscrypt_info *parent_ci, *child_ci;
146 int res;
147
148 if ((parent == NULL) || (child == NULL)) {
149 printk(KERN_ERR "parent %p child %p\n", parent, child);
150 BUG_ON(1);
151 }
152
153 /* no restrictions if the parent directory is not encrypted */
154 if (!parent->i_sb->s_cop->is_encrypted(parent))
155 return 1;
156 /* if the child directory is not encrypted, this is always a problem */
157 if (!parent->i_sb->s_cop->is_encrypted(child))
158 return 0;
159 res = fscrypt_get_encryption_info(parent);
160 if (res)
161 return 0;
162 res = fscrypt_get_encryption_info(child);
163 if (res)
164 return 0;
165 parent_ci = parent->i_crypt_info;
166 child_ci = child->i_crypt_info;
167 if (!parent_ci && !child_ci)
168 return 1;
169 if (!parent_ci || !child_ci)
170 return 0;
171
172 return (memcmp(parent_ci->ci_master_key,
173 child_ci->ci_master_key,
174 FS_KEY_DESCRIPTOR_SIZE) == 0 &&
175 (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
176 (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
177 (parent_ci->ci_flags == child_ci->ci_flags));
178}
179EXPORT_SYMBOL(fscrypt_has_permitted_context);
180
181/**
182 * fscrypt_inherit_context() - Sets a child context from its parent
183 * @parent: Parent inode from which the context is inherited.
184 * @child: Child inode that inherits the context from @parent.
185 * @fs_data: private data given by FS.
186 * @preload: preload child i_crypt_info
187 *
188 * Return: Zero on success, non-zero otherwise
189 */
190int fscrypt_inherit_context(struct inode *parent, struct inode *child,
191 void *fs_data, bool preload)
192{
193 struct fscrypt_context ctx;
194 struct fscrypt_info *ci;
195 int res;
196
197 if (!parent->i_sb->s_cop->set_context)
198 return -EOPNOTSUPP;
199
200 res = fscrypt_get_encryption_info(parent);
201 if (res < 0)
202 return res;
203
204 ci = parent->i_crypt_info;
205 if (ci == NULL)
206 return -ENOKEY;
207
208 ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
209 if (fscrypt_dummy_context_enabled(parent)) {
210 ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
211 ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
212 ctx.flags = 0;
213 memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
214 res = 0;
215 } else {
216 ctx.contents_encryption_mode = ci->ci_data_mode;
217 ctx.filenames_encryption_mode = ci->ci_filename_mode;
218 ctx.flags = ci->ci_flags;
219 memcpy(ctx.master_key_descriptor, ci->ci_master_key,
220 FS_KEY_DESCRIPTOR_SIZE);
221 }
222 get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
223 res = parent->i_sb->s_cop->set_context(child, &ctx,
224 sizeof(ctx), fs_data);
225 if (res)
226 return res;
227 return preload ? fscrypt_get_encryption_info(child): 0;
228}
229EXPORT_SYMBOL(fscrypt_inherit_context);
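Filesystems wire fscrypt_process_policy()/fscrypt_get_policy() up behind per-filesystem ioctls. A hedged userspace sketch of driving them follows; it assumes the FS_IOC_SET_ENCRYPTION_POLICY ioctl and struct fscrypt_policy definitions from <linux/fs.h> (on trees of this vintage the ext4-prefixed equivalents may be what is actually exported), and the mode values 1 (AES-256-XTS) and 4 (AES-256-CTS) used elsewhere in this series:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* assumed: FS_IOC_SET_ENCRYPTION_POLICY, struct fscrypt_policy */

/* Apply a v0 policy to an (empty) directory. The kernel side is
 * fscrypt_process_policy() above, so expect -ENOTEMPTY for a
 * non-empty unencrypted directory and -EINVAL on a mismatch. */
int set_dir_policy(const char *dir, const unsigned char key_desc[8])
{
	struct fscrypt_policy p;
	int fd, rc;

	fd = open(dir, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -1;
	memset(&p, 0, sizeof(p));
	p.version = 0;				/* the only accepted version */
	p.contents_encryption_mode = 1;		/* assumed: AES-256-XTS */
	p.filenames_encryption_mode = 4;	/* assumed: AES-256-CTS */
	p.flags = 0;
	memcpy(p.master_key_descriptor, key_desc, 8);
	rc = ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &p);
	if (rc)
		perror("FS_IOC_SET_ENCRYPTION_POLICY");
	close(fd);
	return rc;
}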
diff --git a/fs/dax.c b/fs/dax.c
index bbb2ad783770..90322eb7498c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -286,8 +286,13 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
286 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) 286 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
287 inode_unlock(inode); 287 inode_unlock(inode);
288 288
289 if ((retval > 0) && end_io) 289 if (end_io) {
290 end_io(iocb, pos, retval, bh.b_private); 290 int err;
291
292 err = end_io(iocb, pos, retval, bh.b_private);
293 if (err)
294 retval = err;
295 }
291 296
292 if (!(flags & DIO_SKIP_DIO_COUNT)) 297 if (!(flags & DIO_SKIP_DIO_COUNT))
293 inode_dio_end(inode); 298 inode_dio_end(inode);
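The contract change in this hunk: ->end_io now returns an int, is invoked even for short or failed I/O, and may override the result. A hedged sketch of what a conforming callback looks like; the foo_* names are hypothetical, not from this series:

static int foo_convert_unwritten(struct inode *inode, loff_t offset,
				 ssize_t size);	/* hypothetical helper */

static int foo_dio_end_io(struct kiocb *iocb, loff_t offset,
			  ssize_t size, void *private)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (size <= 0)
		return 0;	/* I/O failed or was short; nothing to convert */

	/* e.g. convert unwritten extents covering [offset, offset + size) */
	return foo_convert_unwritten(inode, offset, size);
}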
diff --git a/fs/dcache.c b/fs/dcache.c
index 2398f9f94337..32ceae3e6112 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1745,13 +1745,12 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1745 unsigned add_flags = d_flags_for_inode(inode); 1745 unsigned add_flags = d_flags_for_inode(inode);
1746 1746
1747 spin_lock(&dentry->d_lock); 1747 spin_lock(&dentry->d_lock);
1748 if (inode) 1748 hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
1749 hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
1750 raw_write_seqcount_begin(&dentry->d_seq); 1749 raw_write_seqcount_begin(&dentry->d_seq);
1751 __d_set_inode_and_type(dentry, inode, add_flags); 1750 __d_set_inode_and_type(dentry, inode, add_flags);
1752 raw_write_seqcount_end(&dentry->d_seq); 1751 raw_write_seqcount_end(&dentry->d_seq);
1752 __fsnotify_d_instantiate(dentry);
1753 spin_unlock(&dentry->d_lock); 1753 spin_unlock(&dentry->d_lock);
1754 fsnotify_d_instantiate(dentry, inode);
1755} 1754}
1756 1755
1757/** 1756/**
@@ -1772,91 +1771,16 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1772void d_instantiate(struct dentry *entry, struct inode * inode) 1771void d_instantiate(struct dentry *entry, struct inode * inode)
1773{ 1772{
1774 BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); 1773 BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
1775 if (inode) 1774 if (inode) {
1776 spin_lock(&inode->i_lock); 1775 spin_lock(&inode->i_lock);
1777 __d_instantiate(entry, inode); 1776 __d_instantiate(entry, inode);
1778 if (inode)
1779 spin_unlock(&inode->i_lock); 1777 spin_unlock(&inode->i_lock);
1778 }
1780 security_d_instantiate(entry, inode); 1779 security_d_instantiate(entry, inode);
1781} 1780}
1782EXPORT_SYMBOL(d_instantiate); 1781EXPORT_SYMBOL(d_instantiate);
1783 1782
1784/** 1783/**
1785 * d_instantiate_unique - instantiate a non-aliased dentry
1786 * @entry: dentry to instantiate
1787 * @inode: inode to attach to this dentry
1788 *
1789 * Fill in inode information in the entry. On success, it returns NULL.
1790 * If an unhashed alias of "entry" already exists, then we return the
1791 * aliased dentry instead and drop one reference to inode.
1792 *
1793 * Note that in order to avoid conflicts with rename() etc, the caller
1794 * had better be holding the parent directory semaphore.
1795 *
1796 * This also assumes that the inode count has been incremented
1797 * (or otherwise set) by the caller to indicate that it is now
1798 * in use by the dcache.
1799 */
1800static struct dentry *__d_instantiate_unique(struct dentry *entry,
1801 struct inode *inode)
1802{
1803 struct dentry *alias;
1804 int len = entry->d_name.len;
1805 const char *name = entry->d_name.name;
1806 unsigned int hash = entry->d_name.hash;
1807
1808 if (!inode) {
1809 __d_instantiate(entry, NULL);
1810 return NULL;
1811 }
1812
1813 hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
1814 /*
1815 * Don't need alias->d_lock here, because aliases with
1816 * d_parent == entry->d_parent are not subject to name or
1817 * parent changes, because the parent inode i_mutex is held.
1818 */
1819 if (alias->d_name.hash != hash)
1820 continue;
1821 if (alias->d_parent != entry->d_parent)
1822 continue;
1823 if (alias->d_name.len != len)
1824 continue;
1825 if (dentry_cmp(alias, name, len))
1826 continue;
1827 __dget(alias);
1828 return alias;
1829 }
1830
1831 __d_instantiate(entry, inode);
1832 return NULL;
1833}
1834
1835struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
1836{
1837 struct dentry *result;
1838
1839 BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
1840
1841 if (inode)
1842 spin_lock(&inode->i_lock);
1843 result = __d_instantiate_unique(entry, inode);
1844 if (inode)
1845 spin_unlock(&inode->i_lock);
1846
1847 if (!result) {
1848 security_d_instantiate(entry, inode);
1849 return NULL;
1850 }
1851
1852 BUG_ON(!d_unhashed(result));
1853 iput(inode);
1854 return result;
1855}
1856
1857EXPORT_SYMBOL(d_instantiate_unique);
1858
1859/**
1860 * d_instantiate_no_diralias - instantiate a non-aliased dentry 1784 * d_instantiate_no_diralias - instantiate a non-aliased dentry
1861 * @entry: dentry to complete 1785 * @entry: dentry to complete
1862 * @inode: inode to attach to this dentry 1786 * @inode: inode to attach to this dentry
@@ -2436,6 +2360,86 @@ void d_rehash(struct dentry * entry)
2436} 2360}
2437EXPORT_SYMBOL(d_rehash); 2361EXPORT_SYMBOL(d_rehash);
2438 2362
2363
2364/* inode->i_lock held if inode is non-NULL */
2365
2366static inline void __d_add(struct dentry *dentry, struct inode *inode)
2367{
2368 if (inode) {
2369 __d_instantiate(dentry, inode);
2370 spin_unlock(&inode->i_lock);
2371 }
2372 security_d_instantiate(dentry, inode);
2373 d_rehash(dentry);
2374}
2375
2376/**
2377 * d_add - add dentry to hash queues
2378 * @entry: dentry to add
2379 * @inode: The inode to attach to this dentry
2380 *
2381 * This adds the entry to the hash queues and initializes @inode.
2382 * The entry was actually filled in earlier during d_alloc().
2383 */
2384
2385void d_add(struct dentry *entry, struct inode *inode)
2386{
2387 if (inode)
2388 spin_lock(&inode->i_lock);
2389 __d_add(entry, inode);
2390}
2391EXPORT_SYMBOL(d_add);
2392
2393/**
2394 * d_exact_alias - find and hash an exact unhashed alias
2395 * @entry: dentry to add
2396 * @inode: The inode to go with this dentry
2397 *
2398 * If an unhashed dentry with the same name/parent and desired
2399 * inode already exists, hash and return it. Otherwise, return
2400 * NULL.
2401 *
2402 * Parent directory should be locked.
2403 */
2404struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
2405{
2406 struct dentry *alias;
2407 int len = entry->d_name.len;
2408 const char *name = entry->d_name.name;
2409 unsigned int hash = entry->d_name.hash;
2410
2411 spin_lock(&inode->i_lock);
2412 hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
2413 /*
2414 * Don't need alias->d_lock here, because aliases with
2415 * d_parent == entry->d_parent are not subject to name or
2416 * parent changes, because the parent inode i_mutex is held.
2417 */
2418 if (alias->d_name.hash != hash)
2419 continue;
2420 if (alias->d_parent != entry->d_parent)
2421 continue;
2422 if (alias->d_name.len != len)
2423 continue;
2424 if (dentry_cmp(alias, name, len))
2425 continue;
2426 spin_lock(&alias->d_lock);
2427 if (!d_unhashed(alias)) {
2428 spin_unlock(&alias->d_lock);
2429 alias = NULL;
2430 } else {
2431 __dget_dlock(alias);
2432 _d_rehash(alias);
2433 spin_unlock(&alias->d_lock);
2434 }
2435 spin_unlock(&inode->i_lock);
2436 return alias;
2437 }
2438 spin_unlock(&inode->i_lock);
2439 return NULL;
2440}
2441EXPORT_SYMBOL(d_exact_alias);
2442
2439/** 2443/**
2440 * dentry_update_name_case - update case insensitive dentry with a new name 2444 * dentry_update_name_case - update case insensitive dentry with a new name
2441 * @dentry: dentry to be updated 2445 * @dentry: dentry to be updated
@@ -2772,10 +2776,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2772 2776
2773 BUG_ON(!d_unhashed(dentry)); 2777 BUG_ON(!d_unhashed(dentry));
2774 2778
2775 if (!inode) { 2779 if (!inode)
2776 __d_instantiate(dentry, NULL);
2777 goto out; 2780 goto out;
2778 } 2781
2779 spin_lock(&inode->i_lock); 2782 spin_lock(&inode->i_lock);
2780 if (S_ISDIR(inode->i_mode)) { 2783 if (S_ISDIR(inode->i_mode)) {
2781 struct dentry *new = __d_find_any_alias(inode); 2784 struct dentry *new = __d_find_any_alias(inode);
@@ -2809,12 +2812,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2809 return new; 2812 return new;
2810 } 2813 }
2811 } 2814 }
2812 /* already taking inode->i_lock, so d_add() by hand */
2813 __d_instantiate(dentry, inode);
2814 spin_unlock(&inode->i_lock);
2815out: 2815out:
2816 security_d_instantiate(dentry, inode); 2816 __d_add(dentry, inode);
2817 d_rehash(dentry);
2818 return NULL; 2817 return NULL;
2819} 2818}
2820EXPORT_SYMBOL(d_splice_alias); 2819EXPORT_SYMBOL(d_splice_alias);
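These dcache changes converge the instantiate paths on __d_add(). The typical consumer is a filesystem ->lookup handing d_splice_alias() a possibly-NULL inode; a minimal sketch (foo_iget() is hypothetical):

static struct inode *foo_iget(struct inode *dir, const struct qstr *name);

static struct dentry *foo_lookup(struct inode *dir, struct dentry *dentry,
				 unsigned int flags)
{
	struct inode *inode = foo_iget(dir, &dentry->d_name);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	/* NULL => negative dentry; d_splice_alias() copes with both and
	 * now funnels through __d_add() above. */
	return d_splice_alias(inode, dentry);
}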
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d6a9012d42ad..476f1ecbd1f0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -253,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
253 if (ret == 0) 253 if (ret == 0)
254 ret = transferred; 254 ret = transferred;
255 255
256 if (dio->end_io && dio->result) 256 if (dio->end_io) {
257 dio->end_io(dio->iocb, offset, transferred, dio->private); 257 int err;
258
259 err = dio->end_io(dio->iocb, offset, ret, dio->private);
260 if (err)
261 ret = err;
262 }
258 263
259 if (!(dio->flags & DIO_SKIP_DIO_COUNT)) 264 if (!(dio->flags & DIO_SKIP_DIO_COUNT))
260 inode_dio_end(dio->inode); 265 inode_dio_end(dio->inode);
@@ -445,7 +450,8 @@ static struct bio *dio_await_one(struct dio *dio)
445 __set_current_state(TASK_UNINTERRUPTIBLE); 450 __set_current_state(TASK_UNINTERRUPTIBLE);
446 dio->waiter = current; 451 dio->waiter = current;
447 spin_unlock_irqrestore(&dio->bio_lock, flags); 452 spin_unlock_irqrestore(&dio->bio_lock, flags);
448 if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) 453 if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
454 !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
449 io_schedule(); 455 io_schedule();
450 /* wake up sets us TASK_RUNNING */ 456 /* wake up sets us TASK_RUNNING */
451 spin_lock_irqsave(&dio->bio_lock, flags); 457 spin_lock_irqsave(&dio->bio_lock, flags);
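As in the DAX hunk earlier, ->end_io is now called whenever it is set, receives the overall result rather than just the transferred byte count, and can replace it with an error; the second hunk additionally restricts blk_poll() to iocbs that asked for polling via IOCB_HIPRI, letting everyone else sleep in io_schedule(). The assumed shape of the completion hook:

/* assumed typedef shape; a non-zero return overrides the I/O result */
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
			   ssize_t bytes, void *private);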
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8e294fbbac39..1669f6291c95 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -343,24 +343,20 @@ static struct config_group *make_cluster(struct config_group *g,
343 struct dlm_cluster *cl = NULL; 343 struct dlm_cluster *cl = NULL;
344 struct dlm_spaces *sps = NULL; 344 struct dlm_spaces *sps = NULL;
345 struct dlm_comms *cms = NULL; 345 struct dlm_comms *cms = NULL;
346 void *gps = NULL;
347 346
348 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); 347 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
349 gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
350 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); 348 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
351 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); 349 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
352 350
353 if (!cl || !gps || !sps || !cms) 351 if (!cl || !sps || !cms)
354 goto fail; 352 goto fail;
355 353
356 config_group_init_type_name(&cl->group, name, &cluster_type); 354 config_group_init_type_name(&cl->group, name, &cluster_type);
357 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); 355 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
358 config_group_init_type_name(&cms->cs_group, "comms", &comms_type); 356 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
359 357
360 cl->group.default_groups = gps; 358 configfs_add_default_group(&sps->ss_group, &cl->group);
361 cl->group.default_groups[0] = &sps->ss_group; 359 configfs_add_default_group(&cms->cs_group, &cl->group);
362 cl->group.default_groups[1] = &cms->cs_group;
363 cl->group.default_groups[2] = NULL;
364 360
365 cl->cl_tcp_port = dlm_config.ci_tcp_port; 361 cl->cl_tcp_port = dlm_config.ci_tcp_port;
366 cl->cl_buffer_size = dlm_config.ci_buffer_size; 362 cl->cl_buffer_size = dlm_config.ci_buffer_size;
@@ -383,7 +379,6 @@ static struct config_group *make_cluster(struct config_group *g,
383 379
384 fail: 380 fail:
385 kfree(cl); 381 kfree(cl);
386 kfree(gps);
387 kfree(sps); 382 kfree(sps);
388 kfree(cms); 383 kfree(cms);
389 return ERR_PTR(-ENOMEM); 384 return ERR_PTR(-ENOMEM);
@@ -392,14 +387,8 @@ static struct config_group *make_cluster(struct config_group *g,
392static void drop_cluster(struct config_group *g, struct config_item *i) 387static void drop_cluster(struct config_group *g, struct config_item *i)
393{ 388{
394 struct dlm_cluster *cl = config_item_to_cluster(i); 389 struct dlm_cluster *cl = config_item_to_cluster(i);
395 struct config_item *tmp;
396 int j;
397 390
398 for (j = 0; cl->group.default_groups[j]; j++) { 391 configfs_remove_default_groups(&cl->group);
399 tmp = &cl->group.default_groups[j]->cg_item;
400 cl->group.default_groups[j] = NULL;
401 config_item_put(tmp);
402 }
403 392
404 space_list = NULL; 393 space_list = NULL;
405 comm_list = NULL; 394 comm_list = NULL;
@@ -410,7 +399,6 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
410static void release_cluster(struct config_item *i) 399static void release_cluster(struct config_item *i)
411{ 400{
412 struct dlm_cluster *cl = config_item_to_cluster(i); 401 struct dlm_cluster *cl = config_item_to_cluster(i);
413 kfree(cl->group.default_groups);
414 kfree(cl); 402 kfree(cl);
415} 403}
416 404
@@ -418,21 +406,17 @@ static struct config_group *make_space(struct config_group *g, const char *name)
418{ 406{
419 struct dlm_space *sp = NULL; 407 struct dlm_space *sp = NULL;
420 struct dlm_nodes *nds = NULL; 408 struct dlm_nodes *nds = NULL;
421 void *gps = NULL;
422 409
423 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); 410 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
424 gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
425 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); 411 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
426 412
427 if (!sp || !gps || !nds) 413 if (!sp || !nds)
428 goto fail; 414 goto fail;
429 415
430 config_group_init_type_name(&sp->group, name, &space_type); 416 config_group_init_type_name(&sp->group, name, &space_type);
431 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
432 417
433 sp->group.default_groups = gps; 418 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
434 sp->group.default_groups[0] = &nds->ns_group; 419 configfs_add_default_group(&nds->ns_group, &sp->group);
435 sp->group.default_groups[1] = NULL;
436 420
437 INIT_LIST_HEAD(&sp->members); 421 INIT_LIST_HEAD(&sp->members);
438 mutex_init(&sp->members_lock); 422 mutex_init(&sp->members_lock);
@@ -441,7 +425,6 @@ static struct config_group *make_space(struct config_group *g, const char *name)
441 425
442 fail: 426 fail:
443 kfree(sp); 427 kfree(sp);
444 kfree(gps);
445 kfree(nds); 428 kfree(nds);
446 return ERR_PTR(-ENOMEM); 429 return ERR_PTR(-ENOMEM);
447} 430}
@@ -449,24 +432,16 @@ static struct config_group *make_space(struct config_group *g, const char *name)
449static void drop_space(struct config_group *g, struct config_item *i) 432static void drop_space(struct config_group *g, struct config_item *i)
450{ 433{
451 struct dlm_space *sp = config_item_to_space(i); 434 struct dlm_space *sp = config_item_to_space(i);
452 struct config_item *tmp;
453 int j;
454 435
455 /* assert list_empty(&sp->members) */ 436 /* assert list_empty(&sp->members) */
456 437
457 for (j = 0; sp->group.default_groups[j]; j++) { 438 configfs_remove_default_groups(&sp->group);
458 tmp = &sp->group.default_groups[j]->cg_item;
459 sp->group.default_groups[j] = NULL;
460 config_item_put(tmp);
461 }
462
463 config_item_put(i); 439 config_item_put(i);
464} 440}
465 441
466static void release_space(struct config_item *i) 442static void release_space(struct config_item *i)
467{ 443{
468 struct dlm_space *sp = config_item_to_space(i); 444 struct dlm_space *sp = config_item_to_space(i);
469 kfree(sp->group.default_groups);
470 kfree(sp); 445 kfree(sp);
471} 446}
472 447
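The conversion above replaces dlm's hand-rolled, NULL-terminated default_groups[] arrays with the configfs helpers. A generic sketch of the idiom, assuming only the two helpers visible in the hunks; the foo_* names are hypothetical and the config_item_type bodies are elided:

static struct config_item_type foo_type;	/* ct_ops etc. elided */
static struct config_item_type foo_child_type;

struct foo_cluster {
	struct config_group group;
	struct config_group child;
};

static struct config_group *foo_make_group(struct config_group *parent,
					   const char *name)
{
	struct foo_cluster *fc = kzalloc(sizeof(*fc), GFP_NOFS);

	if (!fc)
		return ERR_PTR(-ENOMEM);
	config_group_init_type_name(&fc->group, name, &foo_type);
	config_group_init_type_name(&fc->child, "child", &foo_child_type);
	/* replaces the hand-rolled default_groups[] array */
	configfs_add_default_group(&fc->child, &fc->group);
	return &fc->group;
}

static void foo_drop_group(struct config_group *parent,
			   struct config_item *item)
{
	/* drops the reference on every default group in one call */
	configfs_remove_default_groups(to_config_group(item));
	config_item_put(item);
}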
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3a37bd3f9637..00640e70ed7a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -124,7 +124,10 @@ struct connection {
124 struct connection *othercon; 124 struct connection *othercon;
125 struct work_struct rwork; /* Receive workqueue */ 125 struct work_struct rwork; /* Receive workqueue */
126 struct work_struct swork; /* Send workqueue */ 126 struct work_struct swork; /* Send workqueue */
127 void (*orig_error_report)(struct sock *sk); 127 void (*orig_error_report)(struct sock *);
128 void (*orig_data_ready)(struct sock *);
129 void (*orig_state_change)(struct sock *);
130 void (*orig_write_space)(struct sock *);
128}; 131};
129#define sock2con(x) ((struct connection *)(x)->sk_user_data) 132#define sock2con(x) ((struct connection *)(x)->sk_user_data)
130 133
@@ -467,16 +470,24 @@ int dlm_lowcomms_connect_node(int nodeid)
467 470
468static void lowcomms_error_report(struct sock *sk) 471static void lowcomms_error_report(struct sock *sk)
469{ 472{
470 struct connection *con = sock2con(sk); 473 struct connection *con;
471 struct sockaddr_storage saddr; 474 struct sockaddr_storage saddr;
475 int buflen;
476 void (*orig_report)(struct sock *) = NULL;
472 477
473 if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) { 478 read_lock_bh(&sk->sk_callback_lock);
479 con = sock2con(sk);
480 if (con == NULL)
481 goto out;
482
483 orig_report = con->orig_error_report;
484 if (con->sock == NULL ||
485 kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
474 printk_ratelimited(KERN_ERR "dlm: node %d: socket error " 486 printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
475 "sending to node %d, port %d, " 487 "sending to node %d, port %d, "
476 "sk_err=%d/%d\n", dlm_our_nodeid(), 488 "sk_err=%d/%d\n", dlm_our_nodeid(),
477 con->nodeid, dlm_config.ci_tcp_port, 489 con->nodeid, dlm_config.ci_tcp_port,
478 sk->sk_err, sk->sk_err_soft); 490 sk->sk_err, sk->sk_err_soft);
479 return;
480 } else if (saddr.ss_family == AF_INET) { 491 } else if (saddr.ss_family == AF_INET) {
481 struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; 492 struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
482 493
@@ -499,22 +510,54 @@ static void lowcomms_error_report(struct sock *sk)
499 dlm_config.ci_tcp_port, sk->sk_err, 510 dlm_config.ci_tcp_port, sk->sk_err,
500 sk->sk_err_soft); 511 sk->sk_err_soft);
501 } 512 }
502 con->orig_error_report(sk); 513out:
514 read_unlock_bh(&sk->sk_callback_lock);
515 if (orig_report)
516 orig_report(sk);
517}
518
519/* Note: sk_callback_lock must be locked before calling this function. */
520static void save_callbacks(struct connection *con, struct sock *sk)
521{
522 lock_sock(sk);
523 con->orig_data_ready = sk->sk_data_ready;
524 con->orig_state_change = sk->sk_state_change;
525 con->orig_write_space = sk->sk_write_space;
526 con->orig_error_report = sk->sk_error_report;
527 release_sock(sk);
528}
529
530static void restore_callbacks(struct connection *con, struct sock *sk)
531{
532 write_lock_bh(&sk->sk_callback_lock);
533 lock_sock(sk);
534 sk->sk_user_data = NULL;
535 sk->sk_data_ready = con->orig_data_ready;
536 sk->sk_state_change = con->orig_state_change;
537 sk->sk_write_space = con->orig_write_space;
538 sk->sk_error_report = con->orig_error_report;
539 release_sock(sk);
540 write_unlock_bh(&sk->sk_callback_lock);
503} 541}
504 542
505/* Make a socket active */ 543/* Make a socket active */
506static void add_sock(struct socket *sock, struct connection *con) 544static void add_sock(struct socket *sock, struct connection *con)
507{ 545{
546 struct sock *sk = sock->sk;
547
548 write_lock_bh(&sk->sk_callback_lock);
508 con->sock = sock; 549 con->sock = sock;
509 550
551 sk->sk_user_data = con;
552 if (!test_bit(CF_IS_OTHERCON, &con->flags))
553 save_callbacks(con, sk);
510 /* Install a data_ready callback */ 554 /* Install a data_ready callback */
511 con->sock->sk->sk_data_ready = lowcomms_data_ready; 555 sk->sk_data_ready = lowcomms_data_ready;
512 con->sock->sk->sk_write_space = lowcomms_write_space; 556 sk->sk_write_space = lowcomms_write_space;
513 con->sock->sk->sk_state_change = lowcomms_state_change; 557 sk->sk_state_change = lowcomms_state_change;
514 con->sock->sk->sk_user_data = con; 558 sk->sk_allocation = GFP_NOFS;
515 con->sock->sk->sk_allocation = GFP_NOFS; 559 sk->sk_error_report = lowcomms_error_report;
516 con->orig_error_report = con->sock->sk->sk_error_report; 560 write_unlock_bh(&sk->sk_callback_lock);
517 con->sock->sk->sk_error_report = lowcomms_error_report;
518} 561}
519 562
520/* Add the port number to an IPv6 or 4 sockaddr and return the address 563/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -549,6 +592,8 @@ static void close_connection(struct connection *con, bool and_other,
549 592
550 mutex_lock(&con->sock_mutex); 593 mutex_lock(&con->sock_mutex);
551 if (con->sock) { 594 if (con->sock) {
595 if (!test_bit(CF_IS_OTHERCON, &con->flags))
596 restore_callbacks(con, con->sock->sk);
552 sock_release(con->sock); 597 sock_release(con->sock);
553 con->sock = NULL; 598 con->sock = NULL;
554 } 599 }
@@ -1190,6 +1235,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
1190 if (result < 0) { 1235 if (result < 0) {
1191 log_print("Failed to set SO_REUSEADDR on socket: %d", result); 1236 log_print("Failed to set SO_REUSEADDR on socket: %d", result);
1192 } 1237 }
1238 sock->sk->sk_user_data = con;
1239
1193 con->rx_action = tcp_accept_from_sock; 1240 con->rx_action = tcp_accept_from_sock;
1194 con->connect_action = tcp_connect_to_sock; 1241 con->connect_action = tcp_connect_to_sock;
1195 1242
@@ -1271,6 +1318,7 @@ static int sctp_listen_for_all(void)
1271 if (result < 0) 1318 if (result < 0)
1272 log_print("Could not set SCTP NODELAY error %d\n", result); 1319 log_print("Could not set SCTP NODELAY error %d\n", result);
1273 1320
1321 write_lock_bh(&sock->sk->sk_callback_lock);
1274 /* Init con struct */ 1322 /* Init con struct */
1275 sock->sk->sk_user_data = con; 1323 sock->sk->sk_user_data = con;
1276 con->sock = sock; 1324 con->sock = sock;
@@ -1278,6 +1326,8 @@ static int sctp_listen_for_all(void)
1278 con->rx_action = sctp_accept_from_sock; 1326 con->rx_action = sctp_accept_from_sock;
1279 con->connect_action = sctp_connect_to_sock; 1327 con->connect_action = sctp_connect_to_sock;
1280 1328
1329 write_unlock_bh(&sock->sk->sk_callback_lock);
1330
1281 /* Bind to all addresses. */ 1331 /* Bind to all addresses. */
1282 if (sctp_bind_addrs(con, dlm_config.ci_tcp_port)) 1332 if (sctp_bind_addrs(con, dlm_config.ci_tcp_port))
1283 goto create_delsock; 1333 goto create_delsock;
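Distilled from the lowcomms changes: hook a socket's callbacks under sk_callback_lock, remember the originals, and restore them symmetrically at teardown so a late softirq never sees a half-torn-down socket. A hedged generic sketch, assuming <net/sock.h>; struct foo_conn and foo_data_ready() are hypothetical:

static void foo_data_ready(struct sock *sk);	/* our replacement handler */

struct foo_conn {
	void (*orig_data_ready)(struct sock *sk);
};

static void foo_hook(struct foo_conn *c, struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	c->orig_data_ready = sk->sk_data_ready;		/* save original */
	sk->sk_user_data = c;
	sk->sk_data_ready = foo_data_ready;		/* install ours */
	write_unlock_bh(&sk->sk_callback_lock);
}

static void foo_unhook(struct foo_conn *c, struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_user_data = NULL;
	sk->sk_data_ready = c->orig_data_ready;		/* restore */
	write_unlock_bh(&sk->sk_callback_lock);
}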
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 80d6901493cf..64026e53722a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -23,6 +23,8 @@
23 * 02111-1307, USA. 23 * 02111-1307, USA.
24 */ 24 */
25 25
26#include <crypto/hash.h>
27#include <crypto/skcipher.h>
26#include <linux/fs.h> 28#include <linux/fs.h>
27#include <linux/mount.h> 29#include <linux/mount.h>
28#include <linux/pagemap.h> 30#include <linux/pagemap.h>
@@ -30,7 +32,6 @@
30#include <linux/compiler.h> 32#include <linux/compiler.h>
31#include <linux/key.h> 33#include <linux/key.h>
32#include <linux/namei.h> 34#include <linux/namei.h>
33#include <linux/crypto.h>
34#include <linux/file.h> 35#include <linux/file.h>
35#include <linux/scatterlist.h> 36#include <linux/scatterlist.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
@@ -74,6 +75,19 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
74 } 75 }
75} 76}
76 77
78static int ecryptfs_hash_digest(struct crypto_shash *tfm,
79 char *src, int len, char *dst)
80{
81 SHASH_DESC_ON_STACK(desc, tfm);
82 int err;
83
84 desc->tfm = tfm;
85 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
86 err = crypto_shash_digest(desc, src, len, dst);
87 shash_desc_zero(desc);
88 return err;
89}
90
77/** 91/**
78 * ecryptfs_calculate_md5 - calculates the md5 of @src 92 * ecryptfs_calculate_md5 - calculates the md5 of @src
79 * @dst: Pointer to 16 bytes of allocated memory 93 * @dst: Pointer to 16 bytes of allocated memory
@@ -88,45 +102,26 @@ static int ecryptfs_calculate_md5(char *dst,
88 struct ecryptfs_crypt_stat *crypt_stat, 102 struct ecryptfs_crypt_stat *crypt_stat,
89 char *src, int len) 103 char *src, int len)
90{ 104{
91 struct scatterlist sg; 105 struct crypto_shash *tfm;
92 struct hash_desc desc = {
93 .tfm = crypt_stat->hash_tfm,
94 .flags = CRYPTO_TFM_REQ_MAY_SLEEP
95 };
96 int rc = 0; 106 int rc = 0;
97 107
98 mutex_lock(&crypt_stat->cs_hash_tfm_mutex); 108 mutex_lock(&crypt_stat->cs_hash_tfm_mutex);
99 sg_init_one(&sg, (u8 *)src, len); 109 tfm = crypt_stat->hash_tfm;
100 if (!desc.tfm) { 110 if (!tfm) {
101 desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0, 111 tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
102 CRYPTO_ALG_ASYNC); 112 if (IS_ERR(tfm)) {
103 if (IS_ERR(desc.tfm)) { 113 rc = PTR_ERR(tfm);
104 rc = PTR_ERR(desc.tfm);
105 ecryptfs_printk(KERN_ERR, "Error attempting to " 114 ecryptfs_printk(KERN_ERR, "Error attempting to "
106 "allocate crypto context; rc = [%d]\n", 115 "allocate crypto context; rc = [%d]\n",
107 rc); 116 rc);
108 goto out; 117 goto out;
109 } 118 }
110 crypt_stat->hash_tfm = desc.tfm; 119 crypt_stat->hash_tfm = tfm;
111 }
112 rc = crypto_hash_init(&desc);
113 if (rc) {
114 printk(KERN_ERR
115 "%s: Error initializing crypto hash; rc = [%d]\n",
116 __func__, rc);
117 goto out;
118 } 120 }
119 rc = crypto_hash_update(&desc, &sg, len); 121 rc = ecryptfs_hash_digest(tfm, src, len, dst);
120 if (rc) { 122 if (rc) {
121 printk(KERN_ERR 123 printk(KERN_ERR
122 "%s: Error updating crypto hash; rc = [%d]\n", 124 "%s: Error computing crypto hash; rc = [%d]\n",
123 __func__, rc);
124 goto out;
125 }
126 rc = crypto_hash_final(&desc, dst);
127 if (rc) {
128 printk(KERN_ERR
129 "%s: Error finalizing crypto hash; rc = [%d]\n",
130 __func__, rc); 125 __func__, rc);
131 goto out; 126 goto out;
132 } 127 }
@@ -234,10 +229,8 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
234{ 229{
235 struct ecryptfs_key_sig *key_sig, *key_sig_tmp; 230 struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
236 231
237 if (crypt_stat->tfm) 232 crypto_free_skcipher(crypt_stat->tfm);
238 crypto_free_ablkcipher(crypt_stat->tfm); 233 crypto_free_shash(crypt_stat->hash_tfm);
239 if (crypt_stat->hash_tfm)
240 crypto_free_hash(crypt_stat->hash_tfm);
241 list_for_each_entry_safe(key_sig, key_sig_tmp, 234 list_for_each_entry_safe(key_sig, key_sig_tmp,
242 &crypt_stat->keysig_list, crypt_stat_list) { 235 &crypt_stat->keysig_list, crypt_stat_list) {
243 list_del(&key_sig->crypt_stat_list); 236 list_del(&key_sig->crypt_stat_list);
@@ -342,7 +335,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
342 struct scatterlist *src_sg, int size, 335 struct scatterlist *src_sg, int size,
343 unsigned char *iv, int op) 336 unsigned char *iv, int op)
344{ 337{
345 struct ablkcipher_request *req = NULL; 338 struct skcipher_request *req = NULL;
346 struct extent_crypt_result ecr; 339 struct extent_crypt_result ecr;
347 int rc = 0; 340 int rc = 0;
348 341
@@ -358,20 +351,20 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
358 init_completion(&ecr.completion); 351 init_completion(&ecr.completion);
359 352
360 mutex_lock(&crypt_stat->cs_tfm_mutex); 353 mutex_lock(&crypt_stat->cs_tfm_mutex);
361 req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); 354 req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
362 if (!req) { 355 if (!req) {
363 mutex_unlock(&crypt_stat->cs_tfm_mutex); 356 mutex_unlock(&crypt_stat->cs_tfm_mutex);
364 rc = -ENOMEM; 357 rc = -ENOMEM;
365 goto out; 358 goto out;
366 } 359 }
367 360
368 ablkcipher_request_set_callback(req, 361 skcipher_request_set_callback(req,
369 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 362 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
370 extent_crypt_complete, &ecr); 363 extent_crypt_complete, &ecr);
371 /* Consider doing this once, when the file is opened */ 364 /* Consider doing this once, when the file is opened */
372 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { 365 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
373 rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key, 366 rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key,
374 crypt_stat->key_size); 367 crypt_stat->key_size);
375 if (rc) { 368 if (rc) {
376 ecryptfs_printk(KERN_ERR, 369 ecryptfs_printk(KERN_ERR,
377 "Error setting key; rc = [%d]\n", 370 "Error setting key; rc = [%d]\n",
@@ -383,9 +376,9 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
383 crypt_stat->flags |= ECRYPTFS_KEY_SET; 376 crypt_stat->flags |= ECRYPTFS_KEY_SET;
384 } 377 }
385 mutex_unlock(&crypt_stat->cs_tfm_mutex); 378 mutex_unlock(&crypt_stat->cs_tfm_mutex);
386 ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); 379 skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
387 rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) : 380 rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) :
388 crypto_ablkcipher_decrypt(req); 381 crypto_skcipher_decrypt(req);
389 if (rc == -EINPROGRESS || rc == -EBUSY) { 382 if (rc == -EINPROGRESS || rc == -EBUSY) {
390 struct extent_crypt_result *ecr = req->base.data; 383 struct extent_crypt_result *ecr = req->base.data;
391 384
@@ -394,7 +387,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
394 reinit_completion(&ecr->completion); 387 reinit_completion(&ecr->completion);
395 } 388 }
396out: 389out:
397 ablkcipher_request_free(req); 390 skcipher_request_free(req);
398 return rc; 391 return rc;
399} 392}
400 393
@@ -622,7 +615,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
622 crypt_stat->cipher, "cbc"); 615 crypt_stat->cipher, "cbc");
623 if (rc) 616 if (rc)
624 goto out_unlock; 617 goto out_unlock;
625 crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); 618 crypt_stat->tfm = crypto_alloc_skcipher(full_alg_name, 0, 0);
626 if (IS_ERR(crypt_stat->tfm)) { 619 if (IS_ERR(crypt_stat->tfm)) {
627 rc = PTR_ERR(crypt_stat->tfm); 620 rc = PTR_ERR(crypt_stat->tfm);
628 crypt_stat->tfm = NULL; 621 crypt_stat->tfm = NULL;
@@ -631,7 +624,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
631 full_alg_name); 624 full_alg_name);
632 goto out_free; 625 goto out_free;
633 } 626 }
634 crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); 627 crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
635 rc = 0; 628 rc = 0;
636out_free: 629out_free:
637 kfree(full_alg_name); 630 kfree(full_alg_name);
@@ -1499,16 +1492,14 @@ out:
1499 */ 1492 */
1500static int 1493static int
1501ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, 1494ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1502 struct ecryptfs_crypt_stat *crypt_stat,
1503 struct ecryptfs_mount_crypt_stat *mount_crypt_stat) 1495 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
1504{ 1496{
1505 int rc = 0; 1497 int rc = 0;
1506 1498
1507 filename->encrypted_filename = NULL; 1499 filename->encrypted_filename = NULL;
1508 filename->encrypted_filename_size = 0; 1500 filename->encrypted_filename_size = 0;
1509 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) 1501 if (mount_crypt_stat && (mount_crypt_stat->flags
1510 || (mount_crypt_stat && (mount_crypt_stat->flags 1502 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
1511 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
1512 size_t packet_size; 1503 size_t packet_size;
1513 size_t remaining_bytes; 1504 size_t remaining_bytes;
1514 1505
@@ -1591,7 +1582,7 @@ out:
1591 * event, regardless of whether this function succeeds for fails. 1582 * event, regardless of whether this function succeeds for fails.
1592 */ 1583 */
1593static int 1584static int
1594ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, 1585ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
1595 char *cipher_name, size_t *key_size) 1586 char *cipher_name, size_t *key_size)
1596{ 1587{
1597 char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; 1588 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
@@ -1609,21 +1600,18 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1609 "ecb"); 1600 "ecb");
1610 if (rc) 1601 if (rc)
1611 goto out; 1602 goto out;
1612 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); 1603 *key_tfm = crypto_alloc_skcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
1613 if (IS_ERR(*key_tfm)) { 1604 if (IS_ERR(*key_tfm)) {
1614 rc = PTR_ERR(*key_tfm); 1605 rc = PTR_ERR(*key_tfm);
1615 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1606 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1616 "[%s]; rc = [%d]\n", full_alg_name, rc); 1607 "[%s]; rc = [%d]\n", full_alg_name, rc);
1617 goto out; 1608 goto out;
1618 } 1609 }
1619 crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); 1610 crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
1620 if (*key_size == 0) { 1611 if (*key_size == 0)
1621 struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm); 1612 *key_size = crypto_skcipher_default_keysize(*key_tfm);
1622
1623 *key_size = alg->max_keysize;
1624 }
1625 get_random_bytes(dummy_key, *key_size); 1613 get_random_bytes(dummy_key, *key_size);
1626 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1614 rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size);
1627 if (rc) { 1615 if (rc) {
1628 printk(KERN_ERR "Error attempting to set key of size [%zd] for " 1616 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1629 "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, 1617 "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
@@ -1660,8 +1648,7 @@ int ecryptfs_destroy_crypto(void)
1660 list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list, 1648 list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list,
1661 key_tfm_list) { 1649 key_tfm_list) {
1662 list_del(&key_tfm->key_tfm_list); 1650 list_del(&key_tfm->key_tfm_list);
1663 if (key_tfm->key_tfm) 1651 crypto_free_skcipher(key_tfm->key_tfm);
1664 crypto_free_blkcipher(key_tfm->key_tfm);
1665 kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm); 1652 kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm);
1666 } 1653 }
1667 mutex_unlock(&key_tfm_list_mutex); 1654 mutex_unlock(&key_tfm_list_mutex);
@@ -1747,7 +1734,7 @@ int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm)
1747 * Searches for cached item first, and creates new if not found. 1734 * Searches for cached item first, and creates new if not found.
1748 * Returns 0 on success, non-zero if adding new cipher failed 1735 * Returns 0 on success, non-zero if adding new cipher failed
1749 */ 1736 */
1750int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, 1737int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
1751 struct mutex **tfm_mutex, 1738 struct mutex **tfm_mutex,
1752 char *cipher_name) 1739 char *cipher_name)
1753{ 1740{
@@ -1944,7 +1931,6 @@ out:
1944int ecryptfs_encrypt_and_encode_filename( 1931int ecryptfs_encrypt_and_encode_filename(
1945 char **encoded_name, 1932 char **encoded_name,
1946 size_t *encoded_name_size, 1933 size_t *encoded_name_size,
1947 struct ecryptfs_crypt_stat *crypt_stat,
1948 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 1934 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
1949 const char *name, size_t name_size) 1935 const char *name, size_t name_size)
1950{ 1936{
@@ -1953,9 +1939,8 @@ int ecryptfs_encrypt_and_encode_filename(
1953 1939
1954 (*encoded_name) = NULL; 1940 (*encoded_name) = NULL;
1955 (*encoded_name_size) = 0; 1941 (*encoded_name_size) = 0;
1956 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) 1942 if (mount_crypt_stat && (mount_crypt_stat->flags
1957 || (mount_crypt_stat && (mount_crypt_stat->flags 1943 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
1958 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
1959 struct ecryptfs_filename *filename; 1944 struct ecryptfs_filename *filename;
1960 1945
1961 filename = kzalloc(sizeof(*filename), GFP_KERNEL); 1946 filename = kzalloc(sizeof(*filename), GFP_KERNEL);
@@ -1968,8 +1953,7 @@ int ecryptfs_encrypt_and_encode_filename(
1968 } 1953 }
1969 filename->filename = (char *)name; 1954 filename->filename = (char *)name;
1970 filename->filename_size = name_size; 1955 filename->filename_size = name_size;
1971 rc = ecryptfs_encrypt_filename(filename, crypt_stat, 1956 rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat);
1972 mount_crypt_stat);
1973 if (rc) { 1957 if (rc) {
1974 printk(KERN_ERR "%s: Error attempting to encrypt " 1958 printk(KERN_ERR "%s: Error attempting to encrypt "
1975 "filename; rc = [%d]\n", __func__, rc); 1959 "filename; rc = [%d]\n", __func__, rc);
@@ -1980,11 +1964,9 @@ int ecryptfs_encrypt_and_encode_filename(
1980 NULL, &encoded_name_no_prefix_size, 1964 NULL, &encoded_name_no_prefix_size,
1981 filename->encrypted_filename, 1965 filename->encrypted_filename,
1982 filename->encrypted_filename_size); 1966 filename->encrypted_filename_size);
1983 if ((crypt_stat && (crypt_stat->flags 1967 if (mount_crypt_stat
1984 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
1985 || (mount_crypt_stat
1986 && (mount_crypt_stat->flags 1968 && (mount_crypt_stat->flags
1987 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) 1969 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))
1988 (*encoded_name_size) = 1970 (*encoded_name_size) =
1989 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 1971 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
1990 + encoded_name_no_prefix_size); 1972 + encoded_name_no_prefix_size);
@@ -2002,11 +1984,9 @@ int ecryptfs_encrypt_and_encode_filename(
2002 kfree(filename); 1984 kfree(filename);
2003 goto out; 1985 goto out;
2004 } 1986 }
2005 if ((crypt_stat && (crypt_stat->flags 1987 if (mount_crypt_stat
2006 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2007 || (mount_crypt_stat
2008 && (mount_crypt_stat->flags 1988 && (mount_crypt_stat->flags
2009 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { 1989 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
2010 memcpy((*encoded_name), 1990 memcpy((*encoded_name),
2011 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, 1991 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2012 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); 1992 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
@@ -2120,7 +2100,7 @@ out:
2120int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, 2100int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
2121 struct ecryptfs_mount_crypt_stat *mount_crypt_stat) 2101 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
2122{ 2102{
2123 struct blkcipher_desc desc; 2103 struct crypto_skcipher *tfm;
2124 struct mutex *tfm_mutex; 2104 struct mutex *tfm_mutex;
2125 size_t cipher_blocksize; 2105 size_t cipher_blocksize;
2126 int rc; 2106 int rc;
@@ -2130,7 +2110,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
2130 return 0; 2110 return 0;
2131 } 2111 }
2132 2112
2133 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, 2113 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
2134 mount_crypt_stat->global_default_fn_cipher_name); 2114 mount_crypt_stat->global_default_fn_cipher_name);
2135 if (unlikely(rc)) { 2115 if (unlikely(rc)) {
2136 (*namelen) = 0; 2116 (*namelen) = 0;
@@ -2138,7 +2118,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
2138 } 2118 }
2139 2119
2140 mutex_lock(tfm_mutex); 2120 mutex_lock(tfm_mutex);
2141 cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); 2121 cipher_blocksize = crypto_skcipher_blocksize(tfm);
2142 mutex_unlock(tfm_mutex); 2122 mutex_unlock(tfm_mutex);
2143 2123
2144 /* Return an exact amount for the common cases */ 2124 /* Return an exact amount for the common cases */
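The whole ecryptfs conversion above follows one pattern: blkcipher/ablkcipher users become skcipher users, and hash_desc users become shash users. A self-contained sketch of the synchronous skcipher form, assuming <crypto/skcipher.h> and <linux/scatterlist.h>; passing CRYPTO_ALG_ASYNC as the mask requests a tfm whose encrypt call completes inline, which is why no completion plumbing is needed here (foo_encrypt_buf() is hypothetical):

static int foo_encrypt_buf(u8 *buf, unsigned int len,
			   const u8 *key, unsigned int keylen, u8 iv[16])
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	int rc;

	/* mask CRYPTO_ALG_ASYNC => only synchronous implementations */
	tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);
	rc = crypto_skcipher_setkey(tfm, key, keylen);
	if (rc)
		goto out_free_tfm;
	req = skcipher_request_alloc(tfm, GFP_NOFS);
	if (!req) {
		rc = -ENOMEM;
		goto out_free_tfm;
	}
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      NULL, NULL);
	sg_init_one(&sg, buf, len);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	rc = crypto_skcipher_encrypt(req);	/* returns inline for !ASYNC tfm */
	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return rc;
}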
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 7b39260c7bba..d123fbaa28e0 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -28,6 +28,7 @@
28#ifndef ECRYPTFS_KERNEL_H 28#ifndef ECRYPTFS_KERNEL_H
29#define ECRYPTFS_KERNEL_H 29#define ECRYPTFS_KERNEL_H
30 30
31#include <crypto/skcipher.h>
31#include <keys/user-type.h> 32#include <keys/user-type.h>
32#include <keys/encrypted-type.h> 33#include <keys/encrypted-type.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
@@ -38,7 +39,6 @@
38#include <linux/nsproxy.h> 39#include <linux/nsproxy.h>
39#include <linux/backing-dev.h> 40#include <linux/backing-dev.h>
40#include <linux/ecryptfs.h> 41#include <linux/ecryptfs.h>
41#include <linux/crypto.h>
42 42
43#define ECRYPTFS_DEFAULT_IV_BYTES 16 43#define ECRYPTFS_DEFAULT_IV_BYTES 16
44#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 44#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
@@ -233,9 +233,9 @@ struct ecryptfs_crypt_stat {
233 size_t extent_shift; 233 size_t extent_shift;
234 unsigned int extent_mask; 234 unsigned int extent_mask;
235 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 235 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
236 struct crypto_ablkcipher *tfm; 236 struct crypto_skcipher *tfm;
237 struct crypto_hash *hash_tfm; /* Crypto context for generating 237 struct crypto_shash *hash_tfm; /* Crypto context for generating
238 * the initialization vectors */ 238 * the initialization vectors */
239 unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; 239 unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
240 unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; 240 unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
241 unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; 241 unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -309,7 +309,7 @@ struct ecryptfs_global_auth_tok {
309 * keeps a list of crypto API contexts around to use when needed. 309 * keeps a list of crypto API contexts around to use when needed.
310 */ 310 */
311struct ecryptfs_key_tfm { 311struct ecryptfs_key_tfm {
312 struct crypto_blkcipher *key_tfm; 312 struct crypto_skcipher *key_tfm;
313 size_t key_size; 313 size_t key_size;
314 struct mutex key_tfm_mutex; 314 struct mutex key_tfm_mutex;
315 struct list_head key_tfm_list; 315 struct list_head key_tfm_list;
@@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
569int ecryptfs_encrypt_and_encode_filename( 569int ecryptfs_encrypt_and_encode_filename(
570 char **encoded_name, 570 char **encoded_name,
571 size_t *encoded_name_size, 571 size_t *encoded_name_size,
572 struct ecryptfs_crypt_stat *crypt_stat,
573 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 572 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
574 const char *name, size_t name_size); 573 const char *name, size_t name_size);
575struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); 574struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
@@ -659,7 +658,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
659int ecryptfs_init_crypto(void); 658int ecryptfs_init_crypto(void);
660int ecryptfs_destroy_crypto(void); 659int ecryptfs_destroy_crypto(void);
661int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); 660int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm);
662int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, 661int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
663 struct mutex **tfm_mutex, 662 struct mutex **tfm_mutex,
664 char *cipher_name); 663 char *cipher_name);
665int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, 664int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4e685ac1024d..121114e9a464 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -29,7 +29,6 @@
29#include <linux/dcache.h> 29#include <linux/dcache.h>
30#include <linux/namei.h> 30#include <linux/namei.h>
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 32#include <linux/fs_stack.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/xattr.h> 34#include <linux/xattr.h>
@@ -397,11 +396,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
397 int rc = 0; 396 int rc = 0;
398 397
399 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 398 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
400 inode_lock(d_inode(lower_dir_dentry)); 399 lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name,
401 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
402 lower_dir_dentry, 400 lower_dir_dentry,
403 ecryptfs_dentry->d_name.len); 401 ecryptfs_dentry->d_name.len);
404 inode_unlock(d_inode(lower_dir_dentry));
405 if (IS_ERR(lower_dentry)) { 402 if (IS_ERR(lower_dentry)) {
406 rc = PTR_ERR(lower_dentry); 403 rc = PTR_ERR(lower_dentry);
407 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 404 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -419,18 +416,16 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
419 dput(lower_dentry); 416 dput(lower_dentry);
420 rc = ecryptfs_encrypt_and_encode_filename( 417 rc = ecryptfs_encrypt_and_encode_filename(
421 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, 418 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
422 NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, 419 mount_crypt_stat, ecryptfs_dentry->d_name.name,
423 ecryptfs_dentry->d_name.len); 420 ecryptfs_dentry->d_name.len);
424 if (rc) { 421 if (rc) {
425 printk(KERN_ERR "%s: Error attempting to encrypt and encode " 422 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
426 "filename; rc = [%d]\n", __func__, rc); 423 "filename; rc = [%d]\n", __func__, rc);
427 goto out; 424 goto out;
428 } 425 }
429 inode_lock(d_inode(lower_dir_dentry)); 426 lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name,
430 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
431 lower_dir_dentry, 427 lower_dir_dentry,
432 encrypted_and_encoded_name_size); 428 encrypted_and_encoded_name_size);
433 inode_unlock(d_inode(lower_dir_dentry));
434 if (IS_ERR(lower_dentry)) { 429 if (IS_ERR(lower_dentry)) {
435 rc = PTR_ERR(lower_dentry); 430 rc = PTR_ERR(lower_dentry);
436 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 431 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -502,7 +497,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
502 dir->i_sb)->mount_crypt_stat; 497 dir->i_sb)->mount_crypt_stat;
503 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, 498 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
504 &encoded_symlen, 499 &encoded_symlen,
505 NULL,
506 mount_crypt_stat, symname, 500 mount_crypt_stat, symname,
507 strlen(symname)); 501 strlen(symname));
508 if (rc) 502 if (rc)
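Both lookups above now use lookup_one_len_unlocked(), which handles the parent-directory locking internally instead of requiring the caller to take and drop the inode lock around lookup_one_len(). A small sketch of the replacement idiom (foo_find_lower() is hypothetical):

static struct dentry *foo_find_lower(struct dentry *lower_dir,
				     const struct qstr *name)
{
	struct dentry *d;

	/* replaces: inode_lock(d_inode(lower_dir));
	 *           d = lookup_one_len(name->name, lower_dir, name->len);
	 *           inode_unlock(d_inode(lower_dir)); */
	d = lookup_one_len_unlocked(name->name, lower_dir, name->len);
	if (IS_ERR(d))
		pr_debug("lower lookup failed: %ld\n", PTR_ERR(d));
	return d;
}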
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 6bd67e2011f0..9893d1538122 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -25,11 +25,12 @@
25 * 02111-1307, USA. 25 * 02111-1307, USA.
26 */ 26 */
27 27
28#include <crypto/hash.h>
29#include <crypto/skcipher.h>
28#include <linux/string.h> 30#include <linux/string.h>
29#include <linux/pagemap.h> 31#include <linux/pagemap.h>
30#include <linux/key.h> 32#include <linux/key.h>
31#include <linux/random.h> 33#include <linux/random.h>
32#include <linux/crypto.h>
33#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
34#include <linux/slab.h> 35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
@@ -601,12 +602,13 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
601 struct ecryptfs_auth_tok *auth_tok; 602 struct ecryptfs_auth_tok *auth_tok;
602 struct scatterlist src_sg[2]; 603 struct scatterlist src_sg[2];
603 struct scatterlist dst_sg[2]; 604 struct scatterlist dst_sg[2];
604 struct blkcipher_desc desc; 605 struct crypto_skcipher *skcipher_tfm;
606 struct skcipher_request *skcipher_req;
605 char iv[ECRYPTFS_MAX_IV_BYTES]; 607 char iv[ECRYPTFS_MAX_IV_BYTES];
606 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; 608 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
607 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; 609 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
608 struct hash_desc hash_desc; 610 struct crypto_shash *hash_tfm;
609 struct scatterlist hash_sg; 611 struct shash_desc *hash_desc;
610}; 612};
611 613
612/** 614/**
@@ -629,14 +631,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
629 struct key *auth_tok_key = NULL; 631 struct key *auth_tok_key = NULL;
630 int rc = 0; 632 int rc = 0;
631 633
632 s = kmalloc(sizeof(*s), GFP_KERNEL); 634 s = kzalloc(sizeof(*s), GFP_KERNEL);
633 if (!s) { 635 if (!s) {
634 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 636 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
635 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 637 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
636 rc = -ENOMEM; 638 return -ENOMEM;
637 goto out;
638 } 639 }
639 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
640 (*packet_size) = 0; 640 (*packet_size) = 0;
641 rc = ecryptfs_find_auth_tok_for_sig( 641 rc = ecryptfs_find_auth_tok_for_sig(
642 &auth_tok_key, 642 &auth_tok_key,
@@ -649,7 +649,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
649 goto out; 649 goto out;
650 } 650 }
651 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( 651 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
652 &s->desc.tfm, 652 &s->skcipher_tfm,
653 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); 653 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
654 if (unlikely(rc)) { 654 if (unlikely(rc)) {
655 printk(KERN_ERR "Internal error whilst attempting to get " 655 printk(KERN_ERR "Internal error whilst attempting to get "
@@ -658,7 +658,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
658 goto out; 658 goto out;
659 } 659 }
660 mutex_lock(s->tfm_mutex); 660 mutex_lock(s->tfm_mutex);
661 s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); 661 s->block_size = crypto_skcipher_blocksize(s->skcipher_tfm);
662 /* Plus one for the \0 separator between the random prefix 662 /* Plus one for the \0 separator between the random prefix
663 * and the plaintext filename */ 663 * and the plaintext filename */
664 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); 664 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
@@ -691,6 +691,19 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
691 rc = -EINVAL; 691 rc = -EINVAL;
692 goto out_unlock; 692 goto out_unlock;
693 } 693 }
694
695 s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
696 if (!s->skcipher_req) {
697 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
698 "skcipher_request_alloc for %s\n", __func__,
699 crypto_skcipher_driver_name(s->skcipher_tfm));
700 rc = -ENOMEM;
701 goto out_unlock;
702 }
703
704 skcipher_request_set_callback(s->skcipher_req,
705 CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
706
694 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, 707 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
695 GFP_KERNEL); 708 GFP_KERNEL);
696 if (!s->block_aligned_filename) { 709 if (!s->block_aligned_filename) {
@@ -700,7 +713,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
700 rc = -ENOMEM; 713 rc = -ENOMEM;
701 goto out_unlock; 714 goto out_unlock;
702 } 715 }
703 s->i = 0;
704 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; 716 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
705 rc = ecryptfs_write_packet_length(&dest[s->i], 717 rc = ecryptfs_write_packet_length(&dest[s->i],
706 (ECRYPTFS_SIG_SIZE 718 (ECRYPTFS_SIG_SIZE
@@ -738,40 +750,36 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
738 "password tokens\n", __func__); 750 "password tokens\n", __func__);
739 goto out_free_unlock; 751 goto out_free_unlock;
740 } 752 }
741 sg_init_one( 753 s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
742 &s->hash_sg, 754 if (IS_ERR(s->hash_tfm)) {
743 (u8 *)s->auth_tok->token.password.session_key_encryption_key, 755 rc = PTR_ERR(s->hash_tfm);
744 s->auth_tok->token.password.session_key_encryption_key_bytes);
745 s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
746 s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
747 CRYPTO_ALG_ASYNC);
748 if (IS_ERR(s->hash_desc.tfm)) {
749 rc = PTR_ERR(s->hash_desc.tfm);
750 printk(KERN_ERR "%s: Error attempting to " 756 printk(KERN_ERR "%s: Error attempting to "
751 "allocate hash crypto context; rc = [%d]\n", 757 "allocate hash crypto context; rc = [%d]\n",
752 __func__, rc); 758 __func__, rc);
753 goto out_free_unlock; 759 goto out_free_unlock;
754 } 760 }
755 rc = crypto_hash_init(&s->hash_desc); 761
756 if (rc) { 762 s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
757 printk(KERN_ERR 763 crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
758 "%s: Error initializing crypto hash; rc = [%d]\n", 764 if (!s->hash_desc) {
759 __func__, rc); 765 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
760 goto out_release_free_unlock; 766 "kmalloc [%zd] bytes\n", __func__,
761 } 767 sizeof(*s->hash_desc) +
762 rc = crypto_hash_update( 768 crypto_shash_descsize(s->hash_tfm));
763 &s->hash_desc, &s->hash_sg, 769 rc = -ENOMEM;
764 s->auth_tok->token.password.session_key_encryption_key_bytes);
765 if (rc) {
766 printk(KERN_ERR
767 "%s: Error updating crypto hash; rc = [%d]\n",
768 __func__, rc);
769 goto out_release_free_unlock; 770 goto out_release_free_unlock;
770 } 771 }
771 rc = crypto_hash_final(&s->hash_desc, s->hash); 772
773 s->hash_desc->tfm = s->hash_tfm;
774 s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
775
776 rc = crypto_shash_digest(s->hash_desc,
777 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
778 s->auth_tok->token.password.session_key_encryption_key_bytes,
779 s->hash);
772 if (rc) { 780 if (rc) {
773 printk(KERN_ERR 781 printk(KERN_ERR
774 "%s: Error finalizing crypto hash; rc = [%d]\n", 782 "%s: Error computing crypto hash; rc = [%d]\n",
775 __func__, rc); 783 __func__, rc);
776 goto out_release_free_unlock; 784 goto out_release_free_unlock;
777 } 785 }
@@ -780,27 +788,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
780 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; 788 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
781 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) 789 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
782 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { 790 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
783 sg_init_one(&s->hash_sg, (u8 *)s->hash, 791 rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
784 ECRYPTFS_TAG_70_DIGEST_SIZE); 792 ECRYPTFS_TAG_70_DIGEST_SIZE,
785 rc = crypto_hash_init(&s->hash_desc); 793 s->tmp_hash);
786 if (rc) {
787 printk(KERN_ERR
788 "%s: Error initializing crypto hash; "
789 "rc = [%d]\n", __func__, rc);
790 goto out_release_free_unlock;
791 }
792 rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
793 ECRYPTFS_TAG_70_DIGEST_SIZE);
794 if (rc) { 794 if (rc) {
795 printk(KERN_ERR 795 printk(KERN_ERR
796 "%s: Error updating crypto hash; " 796 "%s: Error computing crypto hash; "
797 "rc = [%d]\n", __func__, rc);
798 goto out_release_free_unlock;
799 }
800 rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
801 if (rc) {
802 printk(KERN_ERR
803 "%s: Error finalizing crypto hash; "
804 "rc = [%d]\n", __func__, rc); 797 "rc = [%d]\n", __func__, rc);
805 goto out_release_free_unlock; 798 goto out_release_free_unlock;
806 } 799 }
@@ -834,10 +827,8 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
834 * of the IV here, so we just use 0's for the IV. Note the 827 * of the IV here, so we just use 0's for the IV. Note the
835 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 828 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
836 * >= ECRYPTFS_MAX_IV_BYTES. */ 829 * >= ECRYPTFS_MAX_IV_BYTES. */
837 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 830 rc = crypto_skcipher_setkey(
838 s->desc.info = s->iv; 831 s->skcipher_tfm,
839 rc = crypto_blkcipher_setkey(
840 s->desc.tfm,
841 s->auth_tok->token.password.session_key_encryption_key, 832 s->auth_tok->token.password.session_key_encryption_key,
842 mount_crypt_stat->global_default_fn_cipher_key_bytes); 833 mount_crypt_stat->global_default_fn_cipher_key_bytes);
843 if (rc < 0) { 834 if (rc < 0) {
@@ -850,8 +841,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
850 mount_crypt_stat->global_default_fn_cipher_key_bytes); 841 mount_crypt_stat->global_default_fn_cipher_key_bytes);
851 goto out_release_free_unlock; 842 goto out_release_free_unlock;
852 } 843 }
853 rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, 844 skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
854 s->block_aligned_filename_size); 845 s->block_aligned_filename_size, s->iv);
846 rc = crypto_skcipher_encrypt(s->skcipher_req);
855 if (rc) { 847 if (rc) {
856 printk(KERN_ERR "%s: Error attempting to encrypt filename; " 848 printk(KERN_ERR "%s: Error attempting to encrypt filename; "
857 "rc = [%d]\n", __func__, rc); 849 "rc = [%d]\n", __func__, rc);
@@ -861,7 +853,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
861 (*packet_size) = s->i; 853 (*packet_size) = s->i;
862 (*remaining_bytes) -= (*packet_size); 854 (*remaining_bytes) -= (*packet_size);
863out_release_free_unlock: 855out_release_free_unlock:
864 crypto_free_hash(s->hash_desc.tfm); 856 crypto_free_shash(s->hash_tfm);
865out_free_unlock: 857out_free_unlock:
866 kzfree(s->block_aligned_filename); 858 kzfree(s->block_aligned_filename);
867out_unlock: 859out_unlock:
@@ -871,6 +863,8 @@ out:
871 up_write(&(auth_tok_key->sem)); 863 up_write(&(auth_tok_key->sem));
872 key_put(auth_tok_key); 864 key_put(auth_tok_key);
873 } 865 }
866 skcipher_request_free(s->skcipher_req);
867 kzfree(s->hash_desc);
874 kfree(s); 868 kfree(s);
875 return rc; 869 return rc;
876} 870}
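The hunks above collapse eCryptfs's old crypto_hash_init()/crypto_hash_update()/crypto_hash_final() sequence into a single crypto_shash_digest() call on a heap-allocated descriptor. A minimal sketch of that one-shot shash pattern, assuming a kernel context; the function name is hypothetical and "md5" merely stands in for ECRYPTFS_TAG_70_DIGEST:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int sketch_one_shot_digest(const u8 *in, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int rc;

	tfm = crypto_alloc_shash("md5", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* The descriptor carries per-tfm state; size it accordingly. */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
		       GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;

	rc = crypto_shash_digest(desc, in, len, out);	/* init+update+final */

	kzfree(desc);		/* descriptor may hold key-derived state */
	crypto_free_shash(tfm);
	return rc;
}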
@@ -888,7 +882,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
888 struct ecryptfs_auth_tok *auth_tok; 882 struct ecryptfs_auth_tok *auth_tok;
889 struct scatterlist src_sg[2]; 883 struct scatterlist src_sg[2];
890 struct scatterlist dst_sg[2]; 884 struct scatterlist dst_sg[2];
891 struct blkcipher_desc desc; 885 struct crypto_skcipher *skcipher_tfm;
886 struct skcipher_request *skcipher_req;
892 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; 887 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
893 char iv[ECRYPTFS_MAX_IV_BYTES]; 888 char iv[ECRYPTFS_MAX_IV_BYTES];
894 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; 889 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
@@ -922,14 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
922 (*packet_size) = 0; 917 (*packet_size) = 0;
923 (*filename_size) = 0; 918 (*filename_size) = 0;
924 (*filename) = NULL; 919 (*filename) = NULL;
925 s = kmalloc(sizeof(*s), GFP_KERNEL); 920 s = kzalloc(sizeof(*s), GFP_KERNEL);
926 if (!s) { 921 if (!s) {
927 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 922 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
928 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 923 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
929 rc = -ENOMEM; 924 return -ENOMEM;
930 goto out;
931 } 925 }
932 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
933 if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { 926 if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
934 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " 927 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
935 "at least [%d]\n", __func__, max_packet_size, 928 "at least [%d]\n", __func__, max_packet_size,
@@ -992,7 +985,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
992 rc); 985 rc);
993 goto out; 986 goto out;
994 } 987 }
995 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, 988 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->skcipher_tfm,
996 &s->tfm_mutex, 989 &s->tfm_mutex,
997 s->cipher_string); 990 s->cipher_string);
998 if (unlikely(rc)) { 991 if (unlikely(rc)) {
@@ -1030,12 +1023,23 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
1030 __func__, rc, s->block_aligned_filename_size); 1023 __func__, rc, s->block_aligned_filename_size);
1031 goto out_free_unlock; 1024 goto out_free_unlock;
1032 } 1025 }
1026
1027 s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
1028 if (!s->skcipher_req) {
1029 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
1030 "skcipher_request_alloc for %s\n", __func__,
1031 crypto_skcipher_driver_name(s->skcipher_tfm));
1032 rc = -ENOMEM;
1033 goto out_free_unlock;
1034 }
1035
1036 skcipher_request_set_callback(s->skcipher_req,
1037 CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
1038
1033 /* The characters in the first block effectively do the job of 1039 /* The characters in the first block effectively do the job of
1034 * the IV here, so we just use 0's for the IV. Note the 1040 * the IV here, so we just use 0's for the IV. Note the
1035 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 1041 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
1036 * >= ECRYPTFS_MAX_IV_BYTES. */ 1042 * >= ECRYPTFS_MAX_IV_BYTES. */
1037 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
1038 s->desc.info = s->iv;
1039 /* TODO: Support other key modules than passphrase for 1043 /* TODO: Support other key modules than passphrase for
1040 * filename encryption */ 1044 * filename encryption */
1041 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { 1045 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -1044,8 +1048,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
1044 "password tokens\n", __func__); 1048 "password tokens\n", __func__);
1045 goto out_free_unlock; 1049 goto out_free_unlock;
1046 } 1050 }
1047 rc = crypto_blkcipher_setkey( 1051 rc = crypto_skcipher_setkey(
1048 s->desc.tfm, 1052 s->skcipher_tfm,
1049 s->auth_tok->token.password.session_key_encryption_key, 1053 s->auth_tok->token.password.session_key_encryption_key,
1050 mount_crypt_stat->global_default_fn_cipher_key_bytes); 1054 mount_crypt_stat->global_default_fn_cipher_key_bytes);
1051 if (rc < 0) { 1055 if (rc < 0) {
@@ -1058,14 +1062,14 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
1058 mount_crypt_stat->global_default_fn_cipher_key_bytes); 1062 mount_crypt_stat->global_default_fn_cipher_key_bytes);
1059 goto out_free_unlock; 1063 goto out_free_unlock;
1060 } 1064 }
1061 rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, 1065 skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
1062 s->block_aligned_filename_size); 1066 s->block_aligned_filename_size, s->iv);
1067 rc = crypto_skcipher_decrypt(s->skcipher_req);
1063 if (rc) { 1068 if (rc) {
1064 printk(KERN_ERR "%s: Error attempting to decrypt filename; " 1069 printk(KERN_ERR "%s: Error attempting to decrypt filename; "
1065 "rc = [%d]\n", __func__, rc); 1070 "rc = [%d]\n", __func__, rc);
1066 goto out_free_unlock; 1071 goto out_free_unlock;
1067 } 1072 }
1068 s->i = 0;
1069 while (s->decrypted_filename[s->i] != '\0' 1073 while (s->decrypted_filename[s->i] != '\0'
1070 && s->i < s->block_aligned_filename_size) 1074 && s->i < s->block_aligned_filename_size)
1071 s->i++; 1075 s->i++;
@@ -1108,6 +1112,7 @@ out:
1108 up_write(&(auth_tok_key->sem)); 1112 up_write(&(auth_tok_key->sem));
1109 key_put(auth_tok_key); 1113 key_put(auth_tok_key);
1110 } 1114 }
1115 skcipher_request_free(s->skcipher_req);
1111 kfree(s); 1116 kfree(s);
1112 return rc; 1117 return rc;
1113} 1118}
@@ -1667,9 +1672,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1667 struct scatterlist dst_sg[2]; 1672 struct scatterlist dst_sg[2];
1668 struct scatterlist src_sg[2]; 1673 struct scatterlist src_sg[2];
1669 struct mutex *tfm_mutex; 1674 struct mutex *tfm_mutex;
1670 struct blkcipher_desc desc = { 1675 struct crypto_skcipher *tfm;
1671 .flags = CRYPTO_TFM_REQ_MAY_SLEEP 1676 struct skcipher_request *req = NULL;
1672 };
1673 int rc = 0; 1677 int rc = 0;
1674 1678
1675 if (unlikely(ecryptfs_verbosity > 0)) { 1679 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1680,7 +1684,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1680 auth_tok->token.password.session_key_encryption_key, 1684 auth_tok->token.password.session_key_encryption_key,
1681 auth_tok->token.password.session_key_encryption_key_bytes); 1685 auth_tok->token.password.session_key_encryption_key_bytes);
1682 } 1686 }
1683 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, 1687 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
1684 crypt_stat->cipher); 1688 crypt_stat->cipher);
1685 if (unlikely(rc)) { 1689 if (unlikely(rc)) {
1686 printk(KERN_ERR "Internal error whilst attempting to get " 1690 printk(KERN_ERR "Internal error whilst attempting to get "
@@ -1711,8 +1715,20 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1711 goto out; 1715 goto out;
1712 } 1716 }
1713 mutex_lock(tfm_mutex); 1717 mutex_lock(tfm_mutex);
1714 rc = crypto_blkcipher_setkey( 1718 req = skcipher_request_alloc(tfm, GFP_KERNEL);
1715 desc.tfm, auth_tok->token.password.session_key_encryption_key, 1719 if (!req) {
1720 mutex_unlock(tfm_mutex);
1721 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
1722 "skcipher_request_alloc for %s\n", __func__,
1723 crypto_skcipher_driver_name(tfm));
1724 rc = -ENOMEM;
1725 goto out;
1726 }
1727
1728 skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
1729 NULL, NULL);
1730 rc = crypto_skcipher_setkey(
1731 tfm, auth_tok->token.password.session_key_encryption_key,
1716 crypt_stat->key_size); 1732 crypt_stat->key_size);
1717 if (unlikely(rc < 0)) { 1733 if (unlikely(rc < 0)) {
1718 mutex_unlock(tfm_mutex); 1734 mutex_unlock(tfm_mutex);
@@ -1720,8 +1736,10 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1720 rc = -EINVAL; 1736 rc = -EINVAL;
1721 goto out; 1737 goto out;
1722 } 1738 }
1723 rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg, 1739 skcipher_request_set_crypt(req, src_sg, dst_sg,
1724 auth_tok->session_key.encrypted_key_size); 1740 auth_tok->session_key.encrypted_key_size,
1741 NULL);
1742 rc = crypto_skcipher_decrypt(req);
1725 mutex_unlock(tfm_mutex); 1743 mutex_unlock(tfm_mutex);
1726 if (unlikely(rc)) { 1744 if (unlikely(rc)) {
1727 printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc); 1745 printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc);
@@ -1738,6 +1756,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
1738 crypt_stat->key_size); 1756 crypt_stat->key_size);
1739 } 1757 }
1740out: 1758out:
1759 skcipher_request_free(req);
1741 return rc; 1760 return rc;
1742} 1761}
1743 1762
@@ -2191,16 +2210,14 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2191 size_t max_packet_size; 2210 size_t max_packet_size;
2192 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2211 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2193 crypt_stat->mount_crypt_stat; 2212 crypt_stat->mount_crypt_stat;
2194 struct blkcipher_desc desc = { 2213 struct crypto_skcipher *tfm;
2195 .tfm = NULL, 2214 struct skcipher_request *req;
2196 .flags = CRYPTO_TFM_REQ_MAY_SLEEP
2197 };
2198 int rc = 0; 2215 int rc = 0;
2199 2216
2200 (*packet_size) = 0; 2217 (*packet_size) = 0;
2201 ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, 2218 ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature,
2202 ECRYPTFS_SIG_SIZE); 2219 ECRYPTFS_SIG_SIZE);
2203 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, 2220 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
2204 crypt_stat->cipher); 2221 crypt_stat->cipher);
2205 if (unlikely(rc)) { 2222 if (unlikely(rc)) {
2206 printk(KERN_ERR "Internal error whilst attempting to get " 2223 printk(KERN_ERR "Internal error whilst attempting to get "
@@ -2209,12 +2226,11 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2209 goto out; 2226 goto out;
2210 } 2227 }
2211 if (mount_crypt_stat->global_default_cipher_key_size == 0) { 2228 if (mount_crypt_stat->global_default_cipher_key_size == 0) {
2212 struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm);
2213
2214 printk(KERN_WARNING "No key size specified at mount; " 2229 printk(KERN_WARNING "No key size specified at mount; "
2215 "defaulting to [%d]\n", alg->max_keysize); 2230 "defaulting to [%d]\n",
2231 crypto_skcipher_default_keysize(tfm));
2216 mount_crypt_stat->global_default_cipher_key_size = 2232 mount_crypt_stat->global_default_cipher_key_size =
2217 alg->max_keysize; 2233 crypto_skcipher_default_keysize(tfm);
2218 } 2234 }
2219 if (crypt_stat->key_size == 0) 2235 if (crypt_stat->key_size == 0)
2220 crypt_stat->key_size = 2236 crypt_stat->key_size =
@@ -2284,20 +2300,36 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
2284 goto out; 2300 goto out;
2285 } 2301 }
2286 mutex_lock(tfm_mutex); 2302 mutex_lock(tfm_mutex);
2287 rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key, 2303 rc = crypto_skcipher_setkey(tfm, session_key_encryption_key,
2288 crypt_stat->key_size); 2304 crypt_stat->key_size);
2289 if (rc < 0) { 2305 if (rc < 0) {
2290 mutex_unlock(tfm_mutex); 2306 mutex_unlock(tfm_mutex);
2291 ecryptfs_printk(KERN_ERR, "Error setting key for crypto " 2307 ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
2292 "context; rc = [%d]\n", rc); 2308 "context; rc = [%d]\n", rc);
2293 goto out; 2309 goto out;
2294 } 2310 }
2311
2312 req = skcipher_request_alloc(tfm, GFP_KERNEL);
2313 if (!req) {
2314 mutex_unlock(tfm_mutex);
2315 ecryptfs_printk(KERN_ERR, "Out of kernel memory whilst "
2316 "attempting to skcipher_request_alloc for "
2317 "%s\n", crypto_skcipher_driver_name(tfm));
2318 rc = -ENOMEM;
2319 goto out;
2320 }
2321
2322 skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
2323 NULL, NULL);
2324
2295 rc = 0; 2325 rc = 0;
2296 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", 2326 ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
2297 crypt_stat->key_size); 2327 crypt_stat->key_size);
2298 rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, 2328 skcipher_request_set_crypt(req, src_sg, dst_sg,
2299 (*key_rec).enc_key_size); 2329 (*key_rec).enc_key_size, NULL);
2330 rc = crypto_skcipher_encrypt(req);
2300 mutex_unlock(tfm_mutex); 2331 mutex_unlock(tfm_mutex);
2332 skcipher_request_free(req);
2301 if (rc) { 2333 if (rc) {
2302 printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc); 2334 printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc);
2303 goto out; 2335 goto out;
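Every keystore call site converted above follows the same synchronous skcipher recipe: allocate a request against the shared tfm, mark it CRYPTO_TFM_REQ_MAY_SLEEP with no completion callback (eCryptfs assumes it gets a synchronous tfm, so -EINPROGRESS never surfaces), bind scatterlists and IV, run the operation, then free the request. A condensed sketch of that pattern with hypothetical names, doing one in-place encryption:

#include <crypto/skcipher.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/* Assumes a synchronous tfm and a block-aligned 'len', as eCryptfs does. */
static int sketch_encrypt_in_place(struct crypto_skcipher *tfm,
				   u8 *buf, unsigned int len, u8 *iv)
{
	struct skcipher_request *req;
	struct scatterlist sg;
	int rc;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	/* No callback: completion is expected to be synchronous. */
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      NULL, NULL);
	sg_init_one(&sg, buf, len);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	rc = crypto_skcipher_encrypt(req);
	skcipher_request_free(req);
	return rc;
}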
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e25b6b06bacf..8b0b4a73116d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -29,7 +29,6 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/namei.h> 30#include <linux/namei.h>
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <linux/crypto.h>
33#include <linux/mount.h> 32#include <linux/mount.h>
34#include <linux/pagemap.h> 33#include <linux/pagemap.h>
35#include <linux/key.h> 34#include <linux/key.h>
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index c6ced4cbf0cf..1f5865263b3e 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -30,7 +30,6 @@
30#include <linux/page-flags.h> 30#include <linux/page-flags.h>
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 33#include <linux/scatterlist.h>
35#include <linux/slab.h> 34#include <linux/slab.h>
36#include <asm/unaligned.h> 35#include <asm/unaligned.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index afa1b81c3418..77a486d3a51b 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -29,7 +29,6 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/file.h> 31#include <linux/file.h>
32#include <linux/crypto.h>
33#include <linux/statfs.h> 32#include <linux/statfs.h>
34#include <linux/magic.h> 33#include <linux/magic.h>
35#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ed70cf9fdc7b..1231cd1999d8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
121 u64 count; 121 u64 count;
122 122
123 poll_wait(file, &ctx->wqh, wait); 123 poll_wait(file, &ctx->wqh, wait);
124 smp_rmb(); 124
125 count = ctx->count; 125 /*
126 * All writes to ctx->count occur within ctx->wqh.lock. This read
127 * can be done outside ctx->wqh.lock because we know that poll_wait
128 * takes that lock (through add_wait_queue) if our caller will sleep.
129 *
130 * The read _can_ therefore seep into add_wait_queue's critical
131 * section, but cannot move above it! add_wait_queue's spin_lock acts
132 * as an acquire barrier and ensures that the read be ordered properly
133 * against the writes. The following CAN happen and is safe:
134 *
135 * poll write
136 * ----------------- ------------
137 * lock ctx->wqh.lock (in poll_wait)
138 * count = ctx->count
139 * __add_wait_queue
140 * unlock ctx->wqh.lock
141 * lock ctx->wqh.lock
142 * ctx->count += n
143 * if (waitqueue_active)
144 * wake_up_locked_poll
145 * unlock ctx->wqh.lock
146 * eventfd_poll returns 0
147 *
148 * but the following, which would miss a wakeup, cannot happen:
149 *
150 * poll write
151 * ----------------- ------------
152 * count = ctx->count (INVALID!)
153 * lock ctx->wqh.lock
154 * ctx->count += n
155 * **waitqueue_active is false**
156 * **no wake_up_locked_poll!**
157 * unlock ctx->wqh.lock
158 * lock ctx->wqh.lock (in poll_wait)
159 * __add_wait_queue
160 * unlock ctx->wqh.lock
161 * eventfd_poll returns 0
162 */
163 count = READ_ONCE(ctx->count);
126 164
127 if (count > 0) 165 if (count > 0)
128 events |= POLLIN; 166 events |= POLLIN;
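For reference, the writer this comment pairs against only ever touches ctx->count inside ctx->wqh.lock, which is what makes the lockless READ_ONCE() in eventfd_poll() safe. A sketch of that writer side, assuming struct eventfd_ctx's fields as used in this file and eliding overflow clamping:

#include <linux/poll.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static void sketch_eventfd_signal(struct eventfd_ctx *ctx, u64 n)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	ctx->count += n;			/* publish under the lock */
	if (waitqueue_active(&ctx->wqh))	/* poller already queued? */
		wake_up_locked_poll(&ctx->wqh, POLLIN);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
}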
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cde60741cad2..8a74a2a52e0f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1616{ 1616{
1617 int res = 0, eavail, timed_out = 0; 1617 int res = 0, eavail, timed_out = 0;
1618 unsigned long flags; 1618 unsigned long flags;
1619 long slack = 0; 1619 u64 slack = 0;
1620 wait_queue_t wait; 1620 wait_queue_t wait;
1621 ktime_t expires, *to = NULL; 1621 ktime_t expires, *to = NULL;
1622 1622
diff --git a/fs/exec.c b/fs/exec.c
index dcd4ac7d3f1e..c4010b8207a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,7 @@
56#include <linux/pipe_fs_i.h> 56#include <linux/pipe_fs_i.h>
57#include <linux/oom.h> 57#include <linux/oom.h>
58#include <linux/compat.h> 58#include <linux/compat.h>
59#include <linux/vmalloc.h>
59 60
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/mmu_context.h> 62#include <asm/mmu_context.h>
@@ -198,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
198 return NULL; 199 return NULL;
199 } 200 }
200#endif 201#endif
201 ret = get_user_pages(current, bprm->mm, pos, 202 /*
202 1, write, 1, &page, NULL); 203 * We are doing an exec(). 'current' is the process
204 * doing the exec and bprm->mm is the new process's mm.
205 */
206 ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
207 1, &page, NULL);
203 if (ret <= 0) 208 if (ret <= 0)
204 return NULL; 209 return NULL;
205 210
@@ -831,6 +836,97 @@ int kernel_read(struct file *file, loff_t offset,
831 836
832EXPORT_SYMBOL(kernel_read); 837EXPORT_SYMBOL(kernel_read);
833 838
839int kernel_read_file(struct file *file, void **buf, loff_t *size,
840 loff_t max_size, enum kernel_read_file_id id)
841{
842 loff_t i_size, pos;
843 ssize_t bytes = 0;
844 int ret;
845
846 if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
847 return -EINVAL;
848
849 ret = security_kernel_read_file(file, id);
850 if (ret)
851 return ret;
852
853 i_size = i_size_read(file_inode(file));
854 if (max_size > 0 && i_size > max_size)
855 return -EFBIG;
856 if (i_size <= 0)
857 return -EINVAL;
858
859 *buf = vmalloc(i_size);
860 if (!*buf)
861 return -ENOMEM;
862
863 pos = 0;
864 while (pos < i_size) {
865 bytes = kernel_read(file, pos, (char *)(*buf) + pos,
866 i_size - pos);
867 if (bytes < 0) {
868 ret = bytes;
869 goto out;
870 }
871
872 if (bytes == 0)
873 break;
874 pos += bytes;
875 }
876
877 if (pos != i_size) {
878 ret = -EIO;
879 goto out;
880 }
881
882 ret = security_kernel_post_read_file(file, *buf, i_size, id);
883 if (!ret)
884 *size = pos;
885
886out:
887 if (ret < 0) {
888 vfree(*buf);
889 *buf = NULL;
890 }
891 return ret;
892}
893EXPORT_SYMBOL_GPL(kernel_read_file);
894
895int kernel_read_file_from_path(char *path, void **buf, loff_t *size,
896 loff_t max_size, enum kernel_read_file_id id)
897{
898 struct file *file;
899 int ret;
900
901 if (!path || !*path)
902 return -EINVAL;
903
904 file = filp_open(path, O_RDONLY, 0);
905 if (IS_ERR(file))
906 return PTR_ERR(file);
907
908 ret = kernel_read_file(file, buf, size, max_size, id);
909 fput(file);
910 return ret;
911}
912EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
913
914int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
915 enum kernel_read_file_id id)
916{
917 struct fd f = fdget(fd);
918 int ret = -EBADF;
919
920 if (!f.file)
921 goto out;
922
923 ret = kernel_read_file(f.file, buf, size, max_size, id);
924out:
925 fdput(f);
926 return ret;
927}
928EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
929
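A hypothetical consumer of the new helpers, reading a bounded blob by path. On success the helper leaves a vmalloc()ed buffer in *buf, which the caller owns and must vfree(); both LSM read hooks have already vetted the file:

#include <linux/fs.h>
#include <linux/vmalloc.h>

static int sketch_load_blob(const char *path)
{
	void *data = NULL;
	loff_t size = 0;
	int rc;

	/* Cap the read at 1 MiB; larger files fail with -EFBIG. */
	rc = kernel_read_file_from_path((char *)path, &data, &size,
					1024 * 1024, READING_POLICY);
	if (rc)
		return rc;

	/* ... consume 'size' bytes at 'data' ... */

	vfree(data);
	return 0;
}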
834ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) 930ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
835{ 931{
836 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); 932 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4c69c94cafd8..170939f379d7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
61#define rsv_start rsv_window._rsv_start 61#define rsv_start rsv_window._rsv_start
62#define rsv_end rsv_window._rsv_end 62#define rsv_end rsv_window._rsv_end
63 63
64struct mb_cache;
65
64/* 66/*
65 * second extended-fs super-block data in memory 67 * second extended-fs super-block data in memory
66 */ 68 */
@@ -111,6 +113,7 @@ struct ext2_sb_info {
111 * of the mount options. 113 * of the mount options.
112 */ 114 */
113 spinlock_t s_lock; 115 spinlock_t s_lock;
116 struct mb_cache *s_mb_cache;
114}; 117};
115 118
116static inline spinlock_t * 119static inline spinlock_t *
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2a188413a2b0..b78caf25f746 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
131 131
132 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 132 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
133 133
134 ext2_xattr_put_super(sb); 134 if (sbi->s_mb_cache) {
135 ext2_xattr_destroy_cache(sbi->s_mb_cache);
136 sbi->s_mb_cache = NULL;
137 }
135 if (!(sb->s_flags & MS_RDONLY)) { 138 if (!(sb->s_flags & MS_RDONLY)) {
136 struct ext2_super_block *es = sbi->s_es; 139 struct ext2_super_block *es = sbi->s_es;
137 140
@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1104 ext2_msg(sb, KERN_ERR, "error: insufficient memory"); 1107 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
1105 goto failed_mount3; 1108 goto failed_mount3;
1106 } 1109 }
1110
1111#ifdef CONFIG_EXT2_FS_XATTR
1112 sbi->s_mb_cache = ext2_xattr_create_cache();
1113 if (!sbi->s_mb_cache) {
1114 ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
1115 goto failed_mount3;
1116 }
1117#endif
1107 /* 1118 /*
1108 * set up enough so that it can read an inode 1119 * set up enough so that it can read an inode
1109 */ 1120 */
@@ -1149,6 +1160,8 @@ cantfind_ext2:
1149 sb->s_id); 1160 sb->s_id);
1150 goto failed_mount; 1161 goto failed_mount;
1151failed_mount3: 1162failed_mount3:
1163 if (sbi->s_mb_cache)
1164 ext2_xattr_destroy_cache(sbi->s_mb_cache);
1152 percpu_counter_destroy(&sbi->s_freeblocks_counter); 1165 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1153 percpu_counter_destroy(&sbi->s_freeinodes_counter); 1166 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1154 percpu_counter_destroy(&sbi->s_dirs_counter); 1167 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
1555 1568
1556static int __init init_ext2_fs(void) 1569static int __init init_ext2_fs(void)
1557{ 1570{
1558 int err = init_ext2_xattr(); 1571 int err;
1559 if (err) 1572
1560 return err;
1561 err = init_inodecache(); 1573 err = init_inodecache();
1562 if (err) 1574 if (err)
1563 goto out1; 1575 return err;
1564 err = register_filesystem(&ext2_fs_type); 1576 err = register_filesystem(&ext2_fs_type);
1565 if (err) 1577 if (err)
1566 goto out; 1578 goto out;
1567 return 0; 1579 return 0;
1568out: 1580out:
1569 destroy_inodecache(); 1581 destroy_inodecache();
1570out1:
1571 exit_ext2_xattr();
1572 return err; 1582 return err;
1573} 1583}
1574 1584
@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
1576{ 1586{
1577 unregister_filesystem(&ext2_fs_type); 1587 unregister_filesystem(&ext2_fs_type);
1578 destroy_inodecache(); 1588 destroy_inodecache();
1579 exit_ext2_xattr();
1580} 1589}
1581 1590
1582MODULE_AUTHOR("Remy Card and others"); 1591MODULE_AUTHOR("Remy Card and others");
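The net effect of the super.c hunks is that the xattr block cache becomes per-superblock state: created in ext2_fill_super(), torn down in ext2_put_super(), with the module-wide init_ext2_xattr()/exit_ext2_xattr() pair and the mb_cache_shrink() call at unmount gone. A sketch of the resulting lifecycle, using the helper names from the hunks above:

#include "ext2.h"
#include "xattr.h"

static int sketch_attach_xattr_cache(struct ext2_sb_info *sbi)
{
	sbi->s_mb_cache = ext2_xattr_create_cache();	/* mb_cache_create(10) */
	return sbi->s_mb_cache ? 0 : -ENOMEM;
}

static void sketch_detach_xattr_cache(struct ext2_sb_info *sbi)
{
	/* Entries die with their cache; no global shrink pass needed. */
	if (sbi->s_mb_cache) {
		ext2_xattr_destroy_cache(sbi->s_mb_cache);
		sbi->s_mb_cache = NULL;
	}
}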
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f57a7aba32eb..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -90,14 +90,12 @@
90static int ext2_xattr_set2(struct inode *, struct buffer_head *, 90static int ext2_xattr_set2(struct inode *, struct buffer_head *,
91 struct ext2_xattr_header *); 91 struct ext2_xattr_header *);
92 92
93static int ext2_xattr_cache_insert(struct buffer_head *); 93static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
94static struct buffer_head *ext2_xattr_cache_find(struct inode *, 94static struct buffer_head *ext2_xattr_cache_find(struct inode *,
95 struct ext2_xattr_header *); 95 struct ext2_xattr_header *);
96static void ext2_xattr_rehash(struct ext2_xattr_header *, 96static void ext2_xattr_rehash(struct ext2_xattr_header *,
97 struct ext2_xattr_entry *); 97 struct ext2_xattr_entry *);
98 98
99static struct mb_cache *ext2_xattr_cache;
100
101static const struct xattr_handler *ext2_xattr_handler_map[] = { 99static const struct xattr_handler *ext2_xattr_handler_map[] = {
102 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 100 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
103#ifdef CONFIG_EXT2_FS_POSIX_ACL 101#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
152 size_t name_len, size; 150 size_t name_len, size;
153 char *end; 151 char *end;
154 int error; 152 int error;
153 struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
155 154
156 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", 155 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
157 name_index, name, buffer, (long)buffer_size); 156 name_index, name, buffer, (long)buffer_size);
@@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
196 goto found; 195 goto found;
197 entry = next; 196 entry = next;
198 } 197 }
199 if (ext2_xattr_cache_insert(bh)) 198 if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
200 ea_idebug(inode, "cache insert failed"); 199 ea_idebug(inode, "cache insert failed");
201 error = -ENODATA; 200 error = -ENODATA;
202 goto cleanup; 201 goto cleanup;
@@ -209,7 +208,7 @@ found:
209 le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) 208 le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
210 goto bad_block; 209 goto bad_block;
211 210
212 if (ext2_xattr_cache_insert(bh)) 211 if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
213 ea_idebug(inode, "cache insert failed"); 212 ea_idebug(inode, "cache insert failed");
214 if (buffer) { 213 if (buffer) {
215 error = -ERANGE; 214 error = -ERANGE;
@@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
247 char *end; 246 char *end;
248 size_t rest = buffer_size; 247 size_t rest = buffer_size;
249 int error; 248 int error;
249 struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
250 250
251 ea_idebug(inode, "buffer=%p, buffer_size=%ld", 251 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
252 buffer, (long)buffer_size); 252 buffer, (long)buffer_size);
@@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
281 goto bad_block; 281 goto bad_block;
282 entry = next; 282 entry = next;
283 } 283 }
284 if (ext2_xattr_cache_insert(bh)) 284 if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
285 ea_idebug(inode, "cache insert failed"); 285 ea_idebug(inode, "cache insert failed");
286 286
287 /* list the attribute names */ 287 /* list the attribute names */
@@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
483 /* Here we know that we can set the new attribute. */ 483 /* Here we know that we can set the new attribute. */
484 484
485 if (header) { 485 if (header) {
486 struct mb_cache_entry *ce;
487
488 /* assert(header == HDR(bh)); */ 486 /* assert(header == HDR(bh)); */
489 ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
490 bh->b_blocknr);
491 lock_buffer(bh); 487 lock_buffer(bh);
492 if (header->h_refcount == cpu_to_le32(1)) { 488 if (header->h_refcount == cpu_to_le32(1)) {
489 __u32 hash = le32_to_cpu(header->h_hash);
490
493 ea_bdebug(bh, "modifying in-place"); 491 ea_bdebug(bh, "modifying in-place");
494 if (ce) 492 /*
495 mb_cache_entry_free(ce); 493 * This must happen under buffer lock for
494 * ext2_xattr_set2() to reliably detect modified block
495 */
496 mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
497 hash, bh->b_blocknr);
498
496 /* keep the buffer locked while modifying it. */ 499 /* keep the buffer locked while modifying it. */
497 } else { 500 } else {
498 int offset; 501 int offset;
499 502
500 if (ce)
501 mb_cache_entry_release(ce);
502 unlock_buffer(bh); 503 unlock_buffer(bh);
503 ea_bdebug(bh, "cloning"); 504 ea_bdebug(bh, "cloning");
504 header = kmalloc(bh->b_size, GFP_KERNEL); 505 header = kmalloc(bh->b_size, GFP_KERNEL);
@@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
626 struct super_block *sb = inode->i_sb; 627 struct super_block *sb = inode->i_sb;
627 struct buffer_head *new_bh = NULL; 628 struct buffer_head *new_bh = NULL;
628 int error; 629 int error;
630 struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
629 631
630 if (header) { 632 if (header) {
631 new_bh = ext2_xattr_cache_find(inode, header); 633 new_bh = ext2_xattr_cache_find(inode, header);
@@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
653 don't need to change the reference count. */ 655 don't need to change the reference count. */
654 new_bh = old_bh; 656 new_bh = old_bh;
655 get_bh(new_bh); 657 get_bh(new_bh);
656 ext2_xattr_cache_insert(new_bh); 658 ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
657 } else { 659 } else {
658 /* We need to allocate a new block */ 660 /* We need to allocate a new block */
659 ext2_fsblk_t goal = ext2_group_first_block_no(sb, 661 ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
674 memcpy(new_bh->b_data, header, new_bh->b_size); 676 memcpy(new_bh->b_data, header, new_bh->b_size);
675 set_buffer_uptodate(new_bh); 677 set_buffer_uptodate(new_bh);
676 unlock_buffer(new_bh); 678 unlock_buffer(new_bh);
677 ext2_xattr_cache_insert(new_bh); 679 ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
678 680
679 ext2_xattr_update_super_block(sb); 681 ext2_xattr_update_super_block(sb);
680 } 682 }
@@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
707 709
708 error = 0; 710 error = 0;
709 if (old_bh && old_bh != new_bh) { 711 if (old_bh && old_bh != new_bh) {
710 struct mb_cache_entry *ce;
711
712 /* 712 /*
713 * If there was an old block and we are no longer using it, 713 * If there was an old block and we are no longer using it,
714 * release the old block. 714 * release the old block.
715 */ 715 */
716 ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
717 old_bh->b_blocknr);
718 lock_buffer(old_bh); 716 lock_buffer(old_bh);
719 if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { 717 if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
718 __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
719
720 /*
721 * This must happen under buffer lock for
722 * ext2_xattr_set2() to reliably detect freed block
723 */
724 mb_cache_entry_delete_block(ext2_mb_cache,
725 hash, old_bh->b_blocknr);
720 /* Free the old block. */ 726 /* Free the old block. */
721 if (ce)
722 mb_cache_entry_free(ce);
723 ea_bdebug(old_bh, "freeing"); 727 ea_bdebug(old_bh, "freeing");
724 ext2_free_blocks(inode, old_bh->b_blocknr, 1); 728 ext2_free_blocks(inode, old_bh->b_blocknr, 1);
725 mark_inode_dirty(inode); 729 mark_inode_dirty(inode);
@@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
730 } else { 734 } else {
731 /* Decrement the refcount only. */ 735 /* Decrement the refcount only. */
732 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 736 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
733 if (ce)
734 mb_cache_entry_release(ce);
735 dquot_free_block_nodirty(inode, 1); 737 dquot_free_block_nodirty(inode, 1);
736 mark_inode_dirty(inode); 738 mark_inode_dirty(inode);
737 mark_buffer_dirty(old_bh); 739 mark_buffer_dirty(old_bh);
@@ -757,7 +759,6 @@ void
757ext2_xattr_delete_inode(struct inode *inode) 759ext2_xattr_delete_inode(struct inode *inode)
758{ 760{
759 struct buffer_head *bh = NULL; 761 struct buffer_head *bh = NULL;
760 struct mb_cache_entry *ce;
761 762
762 down_write(&EXT2_I(inode)->xattr_sem); 763 down_write(&EXT2_I(inode)->xattr_sem);
763 if (!EXT2_I(inode)->i_file_acl) 764 if (!EXT2_I(inode)->i_file_acl)
@@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
777 EXT2_I(inode)->i_file_acl); 778 EXT2_I(inode)->i_file_acl);
778 goto cleanup; 779 goto cleanup;
779 } 780 }
780 ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
781 lock_buffer(bh); 781 lock_buffer(bh);
782 if (HDR(bh)->h_refcount == cpu_to_le32(1)) { 782 if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
783 if (ce) 783 __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
784 mb_cache_entry_free(ce); 784
785 /*
786 * This must happen under buffer lock for ext2_xattr_set2() to
787 * reliably detect freed block
788 */
789 mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
790 hash, bh->b_blocknr);
785 ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); 791 ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
786 get_bh(bh); 792 get_bh(bh);
787 bforget(bh); 793 bforget(bh);
788 unlock_buffer(bh); 794 unlock_buffer(bh);
789 } else { 795 } else {
790 le32_add_cpu(&HDR(bh)->h_refcount, -1); 796 le32_add_cpu(&HDR(bh)->h_refcount, -1);
791 if (ce)
792 mb_cache_entry_release(ce);
793 ea_bdebug(bh, "refcount now=%d", 797 ea_bdebug(bh, "refcount now=%d",
794 le32_to_cpu(HDR(bh)->h_refcount)); 798 le32_to_cpu(HDR(bh)->h_refcount));
795 unlock_buffer(bh); 799 unlock_buffer(bh);
@@ -806,18 +810,6 @@ cleanup:
806} 810}
807 811
808/* 812/*
809 * ext2_xattr_put_super()
810 *
811 * This is called when a file system is unmounted.
812 */
813void
814ext2_xattr_put_super(struct super_block *sb)
815{
816 mb_cache_shrink(sb->s_bdev);
817}
818
819
820/*
821 * ext2_xattr_cache_insert() 813 * ext2_xattr_cache_insert()
822 * 814 *
823 * Create a new entry in the extended attribute cache, and insert 815 * Create a new entry in the extended attribute cache, and insert
@@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb)
826 * Returns 0, or a negative error number on failure. 818 * Returns 0, or a negative error number on failure.
827 */ 819 */
828static int 820static int
829ext2_xattr_cache_insert(struct buffer_head *bh) 821ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
830{ 822{
831 __u32 hash = le32_to_cpu(HDR(bh)->h_hash); 823 __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
832 struct mb_cache_entry *ce;
833 int error; 824 int error;
834 825
835 ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); 826 error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
836 if (!ce)
837 return -ENOMEM;
838 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
839 if (error) { 827 if (error) {
840 mb_cache_entry_free(ce);
841 if (error == -EBUSY) { 828 if (error == -EBUSY) {
842 ea_bdebug(bh, "already in cache (%d cache entries)", 829 ea_bdebug(bh, "already in cache (%d cache entries)",
843 atomic_read(&ext2_xattr_cache->c_entry_count)); 830 atomic_read(&ext2_xattr_cache->c_entry_count));
844 error = 0; 831 error = 0;
845 } 832 }
846 } else { 833 } else
847 ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, 834 ea_bdebug(bh, "inserting [%x]", (int)hash);
848 atomic_read(&ext2_xattr_cache->c_entry_count));
849 mb_cache_entry_release(ce);
850 }
851 return error; 835 return error;
852} 836}
853 837
@@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
904{ 888{
905 __u32 hash = le32_to_cpu(header->h_hash); 889 __u32 hash = le32_to_cpu(header->h_hash);
906 struct mb_cache_entry *ce; 890 struct mb_cache_entry *ce;
891 struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
907 892
908 if (!header->h_hash) 893 if (!header->h_hash)
909 return NULL; /* never share */ 894 return NULL; /* never share */
910 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 895 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
911again: 896again:
912 ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, 897 ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
913 hash);
914 while (ce) { 898 while (ce) {
915 struct buffer_head *bh; 899 struct buffer_head *bh;
916 900
917 if (IS_ERR(ce)) {
918 if (PTR_ERR(ce) == -EAGAIN)
919 goto again;
920 break;
921 }
922
923 bh = sb_bread(inode->i_sb, ce->e_block); 901 bh = sb_bread(inode->i_sb, ce->e_block);
924 if (!bh) { 902 if (!bh) {
925 ext2_error(inode->i_sb, "ext2_xattr_cache_find", 903 ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -927,7 +905,21 @@ again:
927 inode->i_ino, (unsigned long) ce->e_block); 905 inode->i_ino, (unsigned long) ce->e_block);
928 } else { 906 } else {
929 lock_buffer(bh); 907 lock_buffer(bh);
930 if (le32_to_cpu(HDR(bh)->h_refcount) > 908 /*
909 * We have to be careful about races with freeing or
910 * rehashing of xattr block. Once we hold buffer lock
911 * xattr block's state is stable so we can check
912 * whether the block got freed / rehashed or not.
913 * Since we unhash mbcache entry under buffer lock when
914 * freeing / rehashing xattr block, checking whether
915 * entry is still hashed is reliable.
916 */
917 if (hlist_bl_unhashed(&ce->e_hash_list)) {
918 mb_cache_entry_put(ext2_mb_cache, ce);
919 unlock_buffer(bh);
920 brelse(bh);
921 goto again;
922 } else if (le32_to_cpu(HDR(bh)->h_refcount) >
931 EXT2_XATTR_REFCOUNT_MAX) { 923 EXT2_XATTR_REFCOUNT_MAX) {
932 ea_idebug(inode, "block %ld refcount %d>%d", 924 ea_idebug(inode, "block %ld refcount %d>%d",
933 (unsigned long) ce->e_block, 925 (unsigned long) ce->e_block,
@@ -936,13 +928,14 @@ again:
936 } else if (!ext2_xattr_cmp(header, HDR(bh))) { 928 } else if (!ext2_xattr_cmp(header, HDR(bh))) {
937 ea_bdebug(bh, "b_count=%d", 929 ea_bdebug(bh, "b_count=%d",
938 atomic_read(&(bh->b_count))); 930 atomic_read(&(bh->b_count)));
939 mb_cache_entry_release(ce); 931 mb_cache_entry_touch(ext2_mb_cache, ce);
932 mb_cache_entry_put(ext2_mb_cache, ce);
940 return bh; 933 return bh;
941 } 934 }
942 unlock_buffer(bh); 935 unlock_buffer(bh);
943 brelse(bh); 936 brelse(bh);
944 } 937 }
945 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); 938 ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
946 } 939 }
947 return NULL; 940 return NULL;
948} 941}
@@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
1015 1008
1016#undef BLOCK_HASH_SHIFT 1009#undef BLOCK_HASH_SHIFT
1017 1010
1018int __init 1011#define HASH_BUCKET_BITS 10
1019init_ext2_xattr(void) 1012
1013struct mb_cache *ext2_xattr_create_cache(void)
1020{ 1014{
1021 ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); 1015 return mb_cache_create(HASH_BUCKET_BITS);
1022 if (!ext2_xattr_cache)
1023 return -ENOMEM;
1024 return 0;
1025} 1016}
1026 1017
1027void 1018void ext2_xattr_destroy_cache(struct mb_cache *cache)
1028exit_ext2_xattr(void)
1029{ 1019{
1030 mb_cache_destroy(ext2_xattr_cache); 1020 if (cache)
1021 mb_cache_destroy(cache);
1031} 1022}
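The recurring pattern in these xattr.c hunks: an mbcache entry is unhashed under the xattr block's buffer lock whenever the block is freed or modified in place, so ext2_xattr_cache_find() can treat "entry still hashed" as "block still valid" and retry once hlist_bl_unhashed() fires. A sketch of the release side, assuming this file's HDR() accessor and header layout, with the actual block freeing elided:

#include <linux/buffer_head.h>
#include <linux/mbcache.h>

static void sketch_release_xattr_block(struct mb_cache *cache,
				       struct buffer_head *bh)
{
	lock_buffer(bh);
	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);

		/* Unhash under the buffer lock so a racing cache_find()
		 * observes hlist_bl_unhashed() and retries the lookup. */
		mb_cache_entry_delete_block(cache, hash, bh->b_blocknr);
		/* ... free the on-disk block ... */
	} else {
		le32_add_cpu(&HDR(bh)->h_refcount, -1);	/* just drop a ref */
	}
	unlock_buffer(bh);
}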
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 60edf298644e..6f82ab1b00ca 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
53#define EXT2_XATTR_SIZE(size) \ 53#define EXT2_XATTR_SIZE(size) \
54 (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) 54 (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
55 55
56struct mb_cache;
57
56# ifdef CONFIG_EXT2_FS_XATTR 58# ifdef CONFIG_EXT2_FS_XATTR
57 59
58extern const struct xattr_handler ext2_xattr_user_handler; 60extern const struct xattr_handler ext2_xattr_user_handler;
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
65extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); 67extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
66 68
67extern void ext2_xattr_delete_inode(struct inode *); 69extern void ext2_xattr_delete_inode(struct inode *);
68extern void ext2_xattr_put_super(struct super_block *);
69 70
70extern int init_ext2_xattr(void); 71extern struct mb_cache *ext2_xattr_create_cache(void);
71extern void exit_ext2_xattr(void); 72extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
72 73
73extern const struct xattr_handler *ext2_xattr_handlers[]; 74extern const struct xattr_handler *ext2_xattr_handlers[];
74 75
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
93{ 94{
94} 95}
95 96
96static inline void 97static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
97ext2_xattr_put_super(struct super_block *sb)
98{
99}
100
101static inline int
102init_ext2_xattr(void)
103{
104 return 0;
105}
106
107static inline void
108exit_ext2_xattr(void)
109{ 98{
110} 99}
111 100
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 38f7562489bb..edc053a81914 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -18,11 +18,9 @@
18 * Special Publication 800-38E and IEEE P1619/D16. 18 * Special Publication 800-38E and IEEE P1619/D16.
19 */ 19 */
20 20
21#include <crypto/hash.h> 21#include <crypto/skcipher.h>
22#include <crypto/sha.h>
23#include <keys/user-type.h> 22#include <keys/user-type.h>
24#include <keys/encrypted-type.h> 23#include <keys/encrypted-type.h>
25#include <linux/crypto.h>
26#include <linux/ecryptfs.h> 24#include <linux/ecryptfs.h>
27#include <linux/gfp.h> 25#include <linux/gfp.h>
28#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -261,21 +259,21 @@ static int ext4_page_crypto(struct inode *inode,
261 259
262{ 260{
263 u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; 261 u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
264 struct ablkcipher_request *req = NULL; 262 struct skcipher_request *req = NULL;
265 DECLARE_EXT4_COMPLETION_RESULT(ecr); 263 DECLARE_EXT4_COMPLETION_RESULT(ecr);
266 struct scatterlist dst, src; 264 struct scatterlist dst, src;
267 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 265 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
268 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 266 struct crypto_skcipher *tfm = ci->ci_ctfm;
269 int res = 0; 267 int res = 0;
270 268
271 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 269 req = skcipher_request_alloc(tfm, GFP_NOFS);
272 if (!req) { 270 if (!req) {
273 printk_ratelimited(KERN_ERR 271 printk_ratelimited(KERN_ERR
274 "%s: crypto_request_alloc() failed\n", 272 "%s: crypto_request_alloc() failed\n",
275 __func__); 273 __func__);
276 return -ENOMEM; 274 return -ENOMEM;
277 } 275 }
278 ablkcipher_request_set_callback( 276 skcipher_request_set_callback(
279 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 277 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
280 ext4_crypt_complete, &ecr); 278 ext4_crypt_complete, &ecr);
281 279
@@ -288,21 +286,21 @@ static int ext4_page_crypto(struct inode *inode,
288 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); 286 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
289 sg_init_table(&src, 1); 287 sg_init_table(&src, 1);
290 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); 288 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
291 ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, 289 skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
292 xts_tweak); 290 xts_tweak);
293 if (rw == EXT4_DECRYPT) 291 if (rw == EXT4_DECRYPT)
294 res = crypto_ablkcipher_decrypt(req); 292 res = crypto_skcipher_decrypt(req);
295 else 293 else
296 res = crypto_ablkcipher_encrypt(req); 294 res = crypto_skcipher_encrypt(req);
297 if (res == -EINPROGRESS || res == -EBUSY) { 295 if (res == -EINPROGRESS || res == -EBUSY) {
298 wait_for_completion(&ecr.completion); 296 wait_for_completion(&ecr.completion);
299 res = ecr.res; 297 res = ecr.res;
300 } 298 }
301 ablkcipher_request_free(req); 299 skcipher_request_free(req);
302 if (res) { 300 if (res) {
303 printk_ratelimited( 301 printk_ratelimited(
304 KERN_ERR 302 KERN_ERR
305 "%s: crypto_ablkcipher_encrypt() returned %d\n", 303 "%s: crypto_skcipher_encrypt() returned %d\n",
306 __func__, res); 304 __func__, res);
307 return res; 305 return res;
308 } 306 }
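Because a skcipher request may be serviced asynchronously, ext4 keeps its completion-based wait: the callback records the final status and signals completion, while -EINPROGRESS notifications for backlogged requests are ignored. A sketch of that plumbing (names hypothetical; it mirrors ext4_crypt_complete and the wait loop above):

#include <linux/completion.h>
#include <linux/crypto.h>

struct sketch_crypt_result {
	struct completion completion;
	int res;
};

static void sketch_crypt_complete(struct crypto_async_request *req, int res)
{
	struct sketch_crypt_result *ecr = req->data;

	if (res == -EINPROGRESS)
		return;		/* backlogged request accepted; not final */
	ecr->res = res;
	complete(&ecr->completion);
}

/* Caller side, after crypto_skcipher_encrypt()/decrypt():
 *
 *	if (res == -EINPROGRESS || res == -EBUSY) {
 *		wait_for_completion(&ecr.completion);
 *		res = ecr.res;
 *	}
 */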
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 2fbef8a14760..1a2f360405db 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -11,11 +11,9 @@
11 * 11 *
12 */ 12 */
13 13
14#include <crypto/hash.h> 14#include <crypto/skcipher.h>
15#include <crypto/sha.h>
16#include <keys/encrypted-type.h> 15#include <keys/encrypted-type.h>
17#include <keys/user-type.h> 16#include <keys/user-type.h>
18#include <linux/crypto.h>
19#include <linux/gfp.h> 17#include <linux/gfp.h>
20#include <linux/kernel.h> 18#include <linux/kernel.h>
21#include <linux/key.h> 19#include <linux/key.h>
@@ -65,10 +63,10 @@ static int ext4_fname_encrypt(struct inode *inode,
65 struct ext4_str *oname) 63 struct ext4_str *oname)
66{ 64{
67 u32 ciphertext_len; 65 u32 ciphertext_len;
68 struct ablkcipher_request *req = NULL; 66 struct skcipher_request *req = NULL;
69 DECLARE_EXT4_COMPLETION_RESULT(ecr); 67 DECLARE_EXT4_COMPLETION_RESULT(ecr);
70 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 68 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
71 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 69 struct crypto_skcipher *tfm = ci->ci_ctfm;
72 int res = 0; 70 int res = 0;
73 char iv[EXT4_CRYPTO_BLOCK_SIZE]; 71 char iv[EXT4_CRYPTO_BLOCK_SIZE];
74 struct scatterlist src_sg, dst_sg; 72 struct scatterlist src_sg, dst_sg;
@@ -95,14 +93,14 @@ static int ext4_fname_encrypt(struct inode *inode,
95 } 93 }
96 94
97 /* Allocate request */ 95 /* Allocate request */
98 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 96 req = skcipher_request_alloc(tfm, GFP_NOFS);
99 if (!req) { 97 if (!req) {
100 printk_ratelimited( 98 printk_ratelimited(
101 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); 99 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
102 kfree(alloc_buf); 100 kfree(alloc_buf);
103 return -ENOMEM; 101 return -ENOMEM;
104 } 102 }
105 ablkcipher_request_set_callback(req, 103 skcipher_request_set_callback(req,
106 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 104 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
107 ext4_dir_crypt_complete, &ecr); 105 ext4_dir_crypt_complete, &ecr);
108 106
@@ -117,14 +115,14 @@ static int ext4_fname_encrypt(struct inode *inode,
117 /* Create encryption request */ 115 /* Create encryption request */
118 sg_init_one(&src_sg, workbuf, ciphertext_len); 116 sg_init_one(&src_sg, workbuf, ciphertext_len);
119 sg_init_one(&dst_sg, oname->name, ciphertext_len); 117 sg_init_one(&dst_sg, oname->name, ciphertext_len);
120 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); 118 skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
121 res = crypto_ablkcipher_encrypt(req); 119 res = crypto_skcipher_encrypt(req);
122 if (res == -EINPROGRESS || res == -EBUSY) { 120 if (res == -EINPROGRESS || res == -EBUSY) {
123 wait_for_completion(&ecr.completion); 121 wait_for_completion(&ecr.completion);
124 res = ecr.res; 122 res = ecr.res;
125 } 123 }
126 kfree(alloc_buf); 124 kfree(alloc_buf);
127 ablkcipher_request_free(req); 125 skcipher_request_free(req);
128 if (res < 0) { 126 if (res < 0) {
129 printk_ratelimited( 127 printk_ratelimited(
130 KERN_ERR "%s: Error (error code %d)\n", __func__, res); 128 KERN_ERR "%s: Error (error code %d)\n", __func__, res);
@@ -145,11 +143,11 @@ static int ext4_fname_decrypt(struct inode *inode,
145 struct ext4_str *oname) 143 struct ext4_str *oname)
146{ 144{
147 struct ext4_str tmp_in[2], tmp_out[1]; 145 struct ext4_str tmp_in[2], tmp_out[1];
148 struct ablkcipher_request *req = NULL; 146 struct skcipher_request *req = NULL;
149 DECLARE_EXT4_COMPLETION_RESULT(ecr); 147 DECLARE_EXT4_COMPLETION_RESULT(ecr);
150 struct scatterlist src_sg, dst_sg; 148 struct scatterlist src_sg, dst_sg;
151 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; 149 struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
152 struct crypto_ablkcipher *tfm = ci->ci_ctfm; 150 struct crypto_skcipher *tfm = ci->ci_ctfm;
153 int res = 0; 151 int res = 0;
154 char iv[EXT4_CRYPTO_BLOCK_SIZE]; 152 char iv[EXT4_CRYPTO_BLOCK_SIZE];
155 unsigned lim = max_name_len(inode); 153 unsigned lim = max_name_len(inode);
@@ -162,13 +160,13 @@ static int ext4_fname_decrypt(struct inode *inode,
162 tmp_out[0].name = oname->name; 160 tmp_out[0].name = oname->name;
163 161
164 /* Allocate request */ 162 /* Allocate request */
165 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 163 req = skcipher_request_alloc(tfm, GFP_NOFS);
166 if (!req) { 164 if (!req) {
167 printk_ratelimited( 165 printk_ratelimited(
168 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); 166 KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
169 return -ENOMEM; 167 return -ENOMEM;
170 } 168 }
171 ablkcipher_request_set_callback(req, 169 skcipher_request_set_callback(req,
172 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 170 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
173 ext4_dir_crypt_complete, &ecr); 171 ext4_dir_crypt_complete, &ecr);
174 172
@@ -178,13 +176,13 @@ static int ext4_fname_decrypt(struct inode *inode,
178 /* Create encryption request */ 176 /* Create encryption request */
179 sg_init_one(&src_sg, iname->name, iname->len); 177 sg_init_one(&src_sg, iname->name, iname->len);
180 sg_init_one(&dst_sg, oname->name, oname->len); 178 sg_init_one(&dst_sg, oname->name, oname->len);
181 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); 179 skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
182 res = crypto_ablkcipher_decrypt(req); 180 res = crypto_skcipher_decrypt(req);
183 if (res == -EINPROGRESS || res == -EBUSY) { 181 if (res == -EINPROGRESS || res == -EBUSY) {
184 wait_for_completion(&ecr.completion); 182 wait_for_completion(&ecr.completion);
185 res = ecr.res; 183 res = ecr.res;
186 } 184 }
187 ablkcipher_request_free(req); 185 skcipher_request_free(req);
188 if (res < 0) { 186 if (res < 0) {
189 printk_ratelimited( 187 printk_ratelimited(
190 KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", 188 KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 9a16d1e75a49..0129d688d1f7 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -8,6 +8,7 @@
8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
9 */ 9 */
10 10
11#include <crypto/skcipher.h>
11#include <keys/encrypted-type.h> 12#include <keys/encrypted-type.h>
12#include <keys/user-type.h> 13#include <keys/user-type.h>
13#include <linux/random.h> 14#include <linux/random.h>
@@ -41,45 +42,42 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
41 char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) 42 char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
42{ 43{
43 int res = 0; 44 int res = 0;
44 struct ablkcipher_request *req = NULL; 45 struct skcipher_request *req = NULL;
45 DECLARE_EXT4_COMPLETION_RESULT(ecr); 46 DECLARE_EXT4_COMPLETION_RESULT(ecr);
46 struct scatterlist src_sg, dst_sg; 47 struct scatterlist src_sg, dst_sg;
47 struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, 48 struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
48 0);
49 49
50 if (IS_ERR(tfm)) { 50 if (IS_ERR(tfm)) {
51 res = PTR_ERR(tfm); 51 res = PTR_ERR(tfm);
52 tfm = NULL; 52 tfm = NULL;
53 goto out; 53 goto out;
54 } 54 }
55 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); 55 crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
56 req = ablkcipher_request_alloc(tfm, GFP_NOFS); 56 req = skcipher_request_alloc(tfm, GFP_NOFS);
57 if (!req) { 57 if (!req) {
58 res = -ENOMEM; 58 res = -ENOMEM;
59 goto out; 59 goto out;
60 } 60 }
61 ablkcipher_request_set_callback(req, 61 skcipher_request_set_callback(req,
62 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, 62 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
63 derive_crypt_complete, &ecr); 63 derive_crypt_complete, &ecr);
64 res = crypto_ablkcipher_setkey(tfm, deriving_key, 64 res = crypto_skcipher_setkey(tfm, deriving_key,
65 EXT4_AES_128_ECB_KEY_SIZE); 65 EXT4_AES_128_ECB_KEY_SIZE);
66 if (res < 0) 66 if (res < 0)
67 goto out; 67 goto out;
68 sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); 68 sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
69 sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); 69 sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
70 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, 70 skcipher_request_set_crypt(req, &src_sg, &dst_sg,
71 EXT4_AES_256_XTS_KEY_SIZE, NULL); 71 EXT4_AES_256_XTS_KEY_SIZE, NULL);
72 res = crypto_ablkcipher_encrypt(req); 72 res = crypto_skcipher_encrypt(req);
73 if (res == -EINPROGRESS || res == -EBUSY) { 73 if (res == -EINPROGRESS || res == -EBUSY) {
74 wait_for_completion(&ecr.completion); 74 wait_for_completion(&ecr.completion);
75 res = ecr.res; 75 res = ecr.res;
76 } 76 }
77 77
78out: 78out:
79 if (req) 79 skcipher_request_free(req);
80 ablkcipher_request_free(req); 80 crypto_free_skcipher(tfm);
81 if (tfm)
82 crypto_free_ablkcipher(tfm);
83 return res; 81 return res;
84} 82}
85 83
@@ -90,7 +88,7 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci)
90 88
91 if (ci->ci_keyring_key) 89 if (ci->ci_keyring_key)
92 key_put(ci->ci_keyring_key); 90 key_put(ci->ci_keyring_key);
93 crypto_free_ablkcipher(ci->ci_ctfm); 91 crypto_free_skcipher(ci->ci_ctfm);
94 kmem_cache_free(ext4_crypt_info_cachep, ci); 92 kmem_cache_free(ext4_crypt_info_cachep, ci);
95} 93}
96 94
@@ -122,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
122 struct ext4_encryption_context ctx; 120 struct ext4_encryption_context ctx;
123 const struct user_key_payload *ukp; 121 const struct user_key_payload *ukp;
124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 122 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
125 struct crypto_ablkcipher *ctfm; 123 struct crypto_skcipher *ctfm;
126 const char *cipher_str; 124 const char *cipher_str;
127 char raw_key[EXT4_MAX_KEY_SIZE]; 125 char raw_key[EXT4_MAX_KEY_SIZE];
128 char mode; 126 char mode;
@@ -237,7 +235,7 @@ retry:
237 if (res) 235 if (res)
238 goto out; 236 goto out;
239got_key: 237got_key:
240 ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); 238 ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
241 if (!ctfm || IS_ERR(ctfm)) { 239 if (!ctfm || IS_ERR(ctfm)) {
242 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; 240 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
243 printk(KERN_DEBUG 241 printk(KERN_DEBUG
@@ -246,11 +244,11 @@ got_key:
246 goto out; 244 goto out;
247 } 245 }
248 crypt_info->ci_ctfm = ctfm; 246 crypt_info->ci_ctfm = ctfm;
249 crypto_ablkcipher_clear_flags(ctfm, ~0); 247 crypto_skcipher_clear_flags(ctfm, ~0);
250 crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), 248 crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm),
251 CRYPTO_TFM_REQ_WEAK_KEY); 249 CRYPTO_TFM_REQ_WEAK_KEY);
252 res = crypto_ablkcipher_setkey(ctfm, raw_key, 250 res = crypto_skcipher_setkey(ctfm, raw_key,
253 ext4_encryption_key_size(mode)); 251 ext4_encryption_key_size(mode));
254 if (res) 252 if (res)
255 goto out; 253 goto out;
256 memzero_explicit(raw_key, sizeof(raw_key)); 254 memzero_explicit(raw_key, sizeof(raw_key));
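The conversion above is mechanical: every ablkcipher name becomes its skcipher counterpart while the request flow stays the same. For reference, a minimal sketch of that flow (hypothetical helper name; the real code above additionally waits on a completion when the cipher returns -EINPROGRESS or -EBUSY):

	#include <crypto/skcipher.h>
	#include <linux/scatterlist.h>

	/* Sketch: one ECB-AES pass over a buffer using the skcipher API. */
	static int example_skcipher_encrypt(const u8 *key, unsigned int keylen,
					    const u8 *src, u8 *dst,
					    unsigned int len)
	{
		struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
		struct skcipher_request *req = NULL;
		struct scatterlist src_sg, dst_sg;
		int res;

		if (IS_ERR(tfm))
			return PTR_ERR(tfm);
		req = skcipher_request_alloc(tfm, GFP_NOFS);
		if (!req) {
			res = -ENOMEM;
			goto out;
		}
		res = crypto_skcipher_setkey(tfm, key, keylen);
		if (res)
			goto out;
		sg_init_one(&src_sg, src, len);
		sg_init_one(&dst_sg, dst, len);
		skcipher_request_set_crypt(req, &src_sg, &dst_sg, len, NULL);
		res = crypto_skcipher_encrypt(req);	/* may return -EINPROGRESS */
	out:
		/* Both free helpers accept NULL, which is why the patch can
		 * drop the old "if (req)" / "if (tfm)" guards. */
		skcipher_request_free(req);
		crypto_free_skcipher(tfm);
		return res;
	}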
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 33f5e2a50cf8..50ba27cbed03 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -285,7 +285,7 @@ errout:
285static inline int is_32bit_api(void) 285static inline int is_32bit_api(void)
286{ 286{
287#ifdef CONFIG_COMPAT 287#ifdef CONFIG_COMPAT
288 return is_compat_task(); 288 return in_compat_syscall();
289#else 289#else
290 return (BITS_PER_LONG == 32); 290 return (BITS_PER_LONG == 32);
291#endif 291#endif
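The one-line dir.c change is subtle: is_compat_task() asks whether the current task is 32-bit, whereas in_compat_syscall() asks whether the current system call entered through the 32-bit path. A sketch of why that matters for this caller (illustrative scenario; exact behaviour varies by architecture):

	/* A 64-bit task can still issue a 32-bit getdents() and then needs
	 * 32-bit f_pos/hash semantics from ext4's readdir code:
	 *
	 *   is_compat_task()    - "is the task 32-bit?"     -> may say no
	 *   in_compat_syscall() - "is this syscall 32-bit?" -> says yes
	 *
	 * is_32bit_api() is really asking the second question. */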
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458a69d4..c04743519865 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -42,6 +42,18 @@
42 */ 42 */
43 43
44/* 44/*
45 * with AGGRESSIVE_CHECK the allocator runs consistency checks over
46 * structures. These checks slow things down a lot.
47 */
48#define AGGRESSIVE_CHECK__
49
50/*
51 * with DOUBLE_CHECK defined mballoc creates persistent in-core
52 * bitmaps, maintains and uses them to check for double allocations
53 */
54#define DOUBLE_CHECK__
55
56/*
45 * Define EXT4FS_DEBUG to produce debug messages 57 * Define EXT4FS_DEBUG to produce debug messages
46 */ 58 */
47#undef EXT4FS_DEBUG 59#undef EXT4FS_DEBUG
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
182 struct bio *bio; /* Linked list of completed 194 struct bio *bio; /* Linked list of completed
183 * bios covering the extent */ 195 * bios covering the extent */
184 unsigned int flag; /* unwritten or not */ 196 unsigned int flag; /* unwritten or not */
197 atomic_t count; /* reference counter */
185 loff_t offset; /* offset in the file */ 198 loff_t offset; /* offset in the file */
186 ssize_t size; /* size of the extent */ 199 ssize_t size; /* size of the extent */
187 atomic_t count; /* reference counter */
188} ext4_io_end_t; 200} ext4_io_end_t;
189 201
190struct ext4_io_submit { 202struct ext4_io_submit {
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
1024 * transaction reserved 1036 * transaction reserved
1025 */ 1037 */
1026 struct list_head i_rsv_conversion_list; 1038 struct list_head i_rsv_conversion_list;
1027 /*
1028 * Completed IOs that need unwritten extents handling and don't have
1029 * transaction reserved
1030 */
1031 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
1032 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
1033 struct work_struct i_rsv_conversion_work; 1039 struct work_struct i_rsv_conversion_work;
1040 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
1034 1041
1035 spinlock_t i_block_reservation_lock; 1042 spinlock_t i_block_reservation_lock;
1036 1043
@@ -1504,25 +1511,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1504 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1511 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1505} 1512}
1506 1513
1507static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1508 struct ext4_io_end *io_end)
1509{
1510 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
1511 io_end->flag |= EXT4_IO_END_UNWRITTEN;
1512 atomic_inc(&EXT4_I(inode)->i_unwritten);
1513 }
1514}
1515
1516static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
1517{
1518 return inode->i_private;
1519}
1520
1521static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
1522{
1523 inode->i_private = io;
1524}
1525
1526/* 1514/*
1527 * Inode dynamic state flags 1515 * Inode dynamic state flags
1528 */ 1516 */
@@ -2506,12 +2494,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
2506int ext4_inode_is_fast_symlink(struct inode *inode); 2494int ext4_inode_is_fast_symlink(struct inode *inode);
2507struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); 2495struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2508struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2496struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2509int ext4_get_block_write(struct inode *inode, sector_t iblock, 2497int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
2510 struct buffer_head *bh_result, int create); 2498 struct buffer_head *bh_result, int create);
2511int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2499int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
2512 struct buffer_head *bh_result, int create); 2500 struct buffer_head *bh_result, int create);
2513int ext4_get_block(struct inode *inode, sector_t iblock, 2501int ext4_get_block(struct inode *inode, sector_t iblock,
2514 struct buffer_head *bh_result, int create); 2502 struct buffer_head *bh_result, int create);
2503int ext4_dio_get_block(struct inode *inode, sector_t iblock,
2504 struct buffer_head *bh_result, int create);
2515int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2505int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2516 struct buffer_head *bh, int create); 2506 struct buffer_head *bh, int create);
2517int ext4_walk_page_buffers(handle_t *handle, 2507int ext4_walk_page_buffers(handle_t *handle,
@@ -2559,6 +2549,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
2559 int used, int quota_claim); 2549 int used, int quota_claim);
2560extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, 2550extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
2561 ext4_fsblk_t pblk, ext4_lblk_t len); 2551 ext4_fsblk_t pblk, ext4_lblk_t len);
2552extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
2553 unsigned int map_len,
2554 struct extent_status *result);
2562 2555
2563/* indirect.c */ 2556/* indirect.c */
2564extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2557extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3285,15 +3278,33 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
3285#define EXT4_WQ_HASH_SZ 37 3278#define EXT4_WQ_HASH_SZ 37
3286#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ 3279#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
3287 EXT4_WQ_HASH_SZ]) 3280 EXT4_WQ_HASH_SZ])
3288#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
3289 EXT4_WQ_HASH_SZ])
3290extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 3281extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
3291extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
3292 3282
3293#define EXT4_RESIZING 0 3283#define EXT4_RESIZING 0
3294extern int ext4_resize_begin(struct super_block *sb); 3284extern int ext4_resize_begin(struct super_block *sb);
3295extern void ext4_resize_end(struct super_block *sb); 3285extern void ext4_resize_end(struct super_block *sb);
3296 3286
3287static inline void ext4_set_io_unwritten_flag(struct inode *inode,
3288 struct ext4_io_end *io_end)
3289{
3290 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3291 io_end->flag |= EXT4_IO_END_UNWRITTEN;
3292 atomic_inc(&EXT4_I(inode)->i_unwritten);
3293 }
3294}
3295
3296static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
3297{
3298 struct inode *inode = io_end->inode;
3299
3300 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
3301 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
3302 /* Wake up anyone waiting on unwritten extent conversion */
3303 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
3304 wake_up_all(ext4_ioend_wq(inode));
3305 }
3306}
3307
3297#endif /* __KERNEL__ */ 3308#endif /* __KERNEL__ */
3298 3309
3299#define EFSBADCRC EBADMSG /* Bad CRC detected */ 3310#define EFSBADCRC EBADMSG /* Bad CRC detected */
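The two helpers moved into ext4.h form a counter/waitqueue pair: the setter bumps i_unwritten the first time an io_end is flagged for unwritten-extent conversion, and the new ext4_clear_io_unwritten_flag() wakes sleepers once the count drains to zero. A waiter, modelled on ext4_unwritten_wait() as used by the file.c hunk further below (a sketch, not the verbatim kernel function):

	static void example_unwritten_wait(struct inode *inode)
	{
		wait_queue_head_t *wq = ext4_ioend_wq(inode);

		/* Sleep until every in-flight unwritten-extent conversion
		 * for this inode has gone through
		 * ext4_clear_io_unwritten_flag(). */
		wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
	}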
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
index ac7d4e813796..1f73c29717e1 100644
--- a/fs/ext4/ext4_crypto.h
+++ b/fs/ext4/ext4_crypto.h
@@ -77,7 +77,7 @@ struct ext4_crypt_info {
77 char ci_data_mode; 77 char ci_data_mode;
78 char ci_filename_mode; 78 char ci_filename_mode;
79 char ci_flags; 79 char ci_flags;
80 struct crypto_ablkcipher *ci_ctfm; 80 struct crypto_skcipher *ci_ctfm;
81 struct key *ci_keyring_key; 81 struct key *ci_keyring_key;
82 char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; 82 char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
83}; 83};
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public Licens 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 17 */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3753ceb0b0dd..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details. 16 * GNU General Public License for more details.
17 * 17 *
18 * You should have received a copy of the GNU General Public Licens 18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software 19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
21 */ 21 */
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1736 */ 1736 */
1737 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1737 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1738 return 0; 1738 return 0;
1739 /*
1740 * The check for IO to unwritten extent is somewhat racy as we
1741 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1742 * dropping i_data_sem. But reserved blocks should save us in that
1743 * case.
1744 */
1739 if (ext4_ext_is_unwritten(ex1) && 1745 if (ext4_ext_is_unwritten(ex1) &&
1740 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || 1746 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1741 atomic_read(&EXT4_I(inode)->i_unwritten) || 1747 atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2293} 2299}
2294 2300
2295/* 2301/*
2296 * ext4_ext_put_gap_in_cache: 2302 * ext4_ext_determine_hole - determine hole around given block
2297 * calculate boundaries of the gap that the requested block fits into 2303 * @inode: inode we lookup in
2298 * and cache this gap 2304 * @path: path in extent tree to @lblk
2305 * @lblk: pointer to logical block around which we want to determine hole
2306 *
2307 * Determine hole length (and start if easily possible) around given logical
2308 * block. We don't try too hard to find the beginning of the hole, but when
2309 * @path already points to the extent just before @lblk, we provide it.
2310 *
2311 * The function returns the length of a hole starting at @lblk. We update @lblk
2312 * to the beginning of the hole if we managed to find it.
2299 */ 2313 */
2300static void 2314static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2301ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2315 struct ext4_ext_path *path,
2302 ext4_lblk_t block) 2316 ext4_lblk_t *lblk)
2303{ 2317{
2304 int depth = ext_depth(inode); 2318 int depth = ext_depth(inode);
2305 ext4_lblk_t len;
2306 ext4_lblk_t lblock;
2307 struct ext4_extent *ex; 2319 struct ext4_extent *ex;
2308 struct extent_status es; 2320 ext4_lblk_t len;
2309 2321
2310 ex = path[depth].p_ext; 2322 ex = path[depth].p_ext;
2311 if (ex == NULL) { 2323 if (ex == NULL) {
2312 /* there is no extent yet, so gap is [0;-] */ 2324 /* there is no extent yet, so gap is [0;-] */
2313 lblock = 0; 2325 *lblk = 0;
2314 len = EXT_MAX_BLOCKS; 2326 len = EXT_MAX_BLOCKS;
2315 ext_debug("cache gap(whole file):"); 2327 } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2316 } else if (block < le32_to_cpu(ex->ee_block)) { 2328 len = le32_to_cpu(ex->ee_block) - *lblk;
2317 lblock = block; 2329 } else if (*lblk >= le32_to_cpu(ex->ee_block)
2318 len = le32_to_cpu(ex->ee_block) - block;
2319 ext_debug("cache gap(before): %u [%u:%u]",
2320 block,
2321 le32_to_cpu(ex->ee_block),
2322 ext4_ext_get_actual_len(ex));
2323 } else if (block >= le32_to_cpu(ex->ee_block)
2324 + ext4_ext_get_actual_len(ex)) { 2330 + ext4_ext_get_actual_len(ex)) {
2325 ext4_lblk_t next; 2331 ext4_lblk_t next;
2326 lblock = le32_to_cpu(ex->ee_block)
2327 + ext4_ext_get_actual_len(ex);
2328 2332
2333 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2329 next = ext4_ext_next_allocated_block(path); 2334 next = ext4_ext_next_allocated_block(path);
2330 ext_debug("cache gap(after): [%u:%u] %u", 2335 BUG_ON(next == *lblk);
2331 le32_to_cpu(ex->ee_block), 2336 len = next - *lblk;
2332 ext4_ext_get_actual_len(ex),
2333 block);
2334 BUG_ON(next == lblock);
2335 len = next - lblock;
2336 } else { 2337 } else {
2337 BUG(); 2338 BUG();
2338 } 2339 }
2340 return len;
2341}
2339 2342
2340 ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); 2343/*
2344 * ext4_ext_put_gap_in_cache:
2345 * calculate boundaries of the gap that the requested block fits into
2346 * and cache this gap
2347 */
2348static void
2349ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2350 ext4_lblk_t hole_len)
2351{
2352 struct extent_status es;
2353
2354 ext4_es_find_delayed_extent_range(inode, hole_start,
2355 hole_start + hole_len - 1, &es);
2341 if (es.es_len) { 2356 if (es.es_len) {
2342 /* There's delayed extent containing lblock? */ 2357 /* There's a delayed extent containing hole_start? */
2343 if (es.es_lblk <= lblock) 2358 if (es.es_lblk <= hole_start)
2344 return; 2359 return;
2345 len = min(es.es_lblk - lblock, len); 2360 hole_len = min(es.es_lblk - hole_start, hole_len);
2346 } 2361 }
2347 ext_debug(" -> %u:%u\n", lblock, len); 2362 ext_debug(" -> %u:%u\n", hole_start, hole_len);
2348 ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); 2363 ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2364 EXTENT_STATUS_HOLE);
2349} 2365}
2350 2366
2351/* 2367/*
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3927static int 3943static int
3928convert_initialized_extent(handle_t *handle, struct inode *inode, 3944convert_initialized_extent(handle_t *handle, struct inode *inode,
3929 struct ext4_map_blocks *map, 3945 struct ext4_map_blocks *map,
3930 struct ext4_ext_path **ppath, int flags, 3946 struct ext4_ext_path **ppath,
3931 unsigned int allocated) 3947 unsigned int allocated)
3932{ 3948{
3933 struct ext4_ext_path *path = *ppath; 3949 struct ext4_ext_path *path = *ppath;
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4007 struct ext4_ext_path *path = *ppath; 4023 struct ext4_ext_path *path = *ppath;
4008 int ret = 0; 4024 int ret = 0;
4009 int err = 0; 4025 int err = 0;
4010 ext4_io_end_t *io = ext4_inode_aio(inode);
4011 4026
4012 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " 4027 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4013 "block %llu, max_blocks %u, flags %x, allocated %u\n", 4028 "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4030 flags | EXT4_GET_BLOCKS_CONVERT); 4045 flags | EXT4_GET_BLOCKS_CONVERT);
4031 if (ret <= 0) 4046 if (ret <= 0)
4032 goto out; 4047 goto out;
4033 /*
4034 * Flag the inode(non aio case) or end_io struct (aio case)
4035 * that this IO needs to conversion to written when IO is
4036 * completed
4037 */
4038 if (io)
4039 ext4_set_io_unwritten_flag(inode, io);
4040 else
4041 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4042 map->m_flags |= EXT4_MAP_UNWRITTEN; 4048 map->m_flags |= EXT4_MAP_UNWRITTEN;
4043 goto out; 4049 goto out;
4044 } 4050 }
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4283 unsigned int allocated = 0, offset = 0; 4289 unsigned int allocated = 0, offset = 0;
4284 unsigned int allocated_clusters = 0; 4290 unsigned int allocated_clusters = 0;
4285 struct ext4_allocation_request ar; 4291 struct ext4_allocation_request ar;
4286 ext4_io_end_t *io = ext4_inode_aio(inode);
4287 ext4_lblk_t cluster_offset; 4292 ext4_lblk_t cluster_offset;
4288 int set_unwritten = 0;
4289 bool map_from_cluster = false; 4293 bool map_from_cluster = false;
4290 4294
4291 ext_debug("blocks %u/%u requested for inode %lu\n", 4295 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4347 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4351 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4348 allocated = convert_initialized_extent( 4352 allocated = convert_initialized_extent(
4349 handle, inode, map, &path, 4353 handle, inode, map, &path,
4350 flags, allocated); 4354 allocated);
4351 goto out2; 4355 goto out2;
4352 } else if (!ext4_ext_is_unwritten(ex)) 4356 } else if (!ext4_ext_is_unwritten(ex))
4353 goto out; 4357 goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4368 * we cannot create blocks if the create flag is zero 4372 * we cannot create blocks if the create flag is zero
4369 */ 4373 */
4370 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4374 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4375 ext4_lblk_t hole_start, hole_len;
4376
4377 hole_start = map->m_lblk;
4378 hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4371 /* 4379 /*
4372 * put just found gap into cache to speed up 4380 * put just found gap into cache to speed up
4373 * subsequent requests 4381 * subsequent requests
4374 */ 4382 */
4375 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4383 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4384
4385 /* Update hole_len to reflect hole size after map->m_lblk */
4386 if (hole_start != map->m_lblk)
4387 hole_len -= map->m_lblk - hole_start;
4388 map->m_pblk = 0;
4389 map->m_len = min_t(unsigned int, map->m_len, hole_len);
4390
4376 goto out2; 4391 goto out2;
4377 } 4392 }
4378 4393
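A worked example of the new hole handling, with hypothetical numbers: suppose one extent covers logical blocks [0, 10), the next allocated block is 100, and a create == 0 lookup asks for m_lblk = 37 with m_len = 64.

	ext4_lblk_t hole_start = 37;	/* map->m_lblk */
	ext4_lblk_t hole_len;

	hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
	/* hole_start is pulled back to 10 (end of the preceding extent),
	 * hole_len = 100 - 10 = 90, i.e. the whole gap [10, 100) */
	ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
	/* the full gap is cached, so later lookups inside it are cheap */
	hole_len -= 37 - hole_start;	/* 90 - 27 = 63 blocks left at m_lblk */
	map->m_pblk = 0;
	map->m_len = min_t(unsigned int, 64, hole_len);	/* report a 63-block hole */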
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
4482 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ 4497 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4483 ext4_ext_mark_unwritten(&newex); 4498 ext4_ext_mark_unwritten(&newex);
4484 map->m_flags |= EXT4_MAP_UNWRITTEN; 4499 map->m_flags |= EXT4_MAP_UNWRITTEN;
4485 /*
4486 * io_end structure was created for every IO write to an
4487 * unwritten extent. To avoid unnecessary conversion,
4488 * here we flag the IO that really needs the conversion.
4489 * For non asycn direct IO case, flag the inode state
4490 * that we need to perform conversion when IO is done.
4491 */
4492 if (flags & EXT4_GET_BLOCKS_PRE_IO)
4493 set_unwritten = 1;
4494 } 4500 }
4495 4501
4496 err = 0; 4502 err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
4501 err = ext4_ext_insert_extent(handle, inode, &path, 4507 err = ext4_ext_insert_extent(handle, inode, &path,
4502 &newex, flags); 4508 &newex, flags);
4503 4509
4504 if (!err && set_unwritten) {
4505 if (io)
4506 ext4_set_io_unwritten_flag(inode, io);
4507 else
4508 ext4_set_inode_state(inode,
4509 EXT4_STATE_DIO_UNWRITTEN);
4510 }
4511
4512 if (err && free_on_err) { 4510 if (err && free_on_err) {
4513 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4511 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4514 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4512 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
823 es->es_lblk = es1->es_lblk; 823 es->es_lblk = es1->es_lblk;
824 es->es_len = es1->es_len; 824 es->es_len = es1->es_len;
825 es->es_pblk = es1->es_pblk; 825 es->es_pblk = es1->es_pblk;
826 if (!ext4_es_is_referenced(es)) 826 if (!ext4_es_is_referenced(es1))
827 ext4_es_set_referenced(es); 827 ext4_es_set_referenced(es1);
828 stats->es_stats_cache_hits++; 828 stats->es_stats_cache_hits++;
829 } else { 829 } else {
830 stats->es_stats_cache_misses++; 830 stats->es_stats_cache_misses++;
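The fix above looks cosmetic but is not: es is a stack copy handed back to the caller, while es1 is the node that actually lives in the extent-status tree. An illustration of the difference (sketch only):

	struct extent_status es;	/* caller-owned copy */

	es = *es1;			/* what the three assignments amount to */
	ext4_es_set_referenced(&es);	/* old code: bit set on the copy only;
					 * the tree node stays unmarked and the
					 * shrinker may still reclaim it */
	ext4_es_set_referenced(es1);	/* fixed code: mark the cached node */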
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4cd318f31cbe..6659e216385e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
93{ 93{
94 struct file *file = iocb->ki_filp; 94 struct file *file = iocb->ki_filp;
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 96 struct blk_plug plug;
98 int o_direct = iocb->ki_flags & IOCB_DIRECT; 97 int o_direct = iocb->ki_flags & IOCB_DIRECT;
98 int unaligned_aio = 0;
99 int overwrite = 0; 99 int overwrite = 0;
100 ssize_t ret; 100 ssize_t ret;
101 101
102 inode_lock(inode);
103 ret = generic_write_checks(iocb, from);
104 if (ret <= 0)
105 goto out;
106
102 /* 107 /*
103 * Unaligned direct AIO must be serialized; see comment above 108 * Unaligned direct AIO writes must be serialized against each other, as
104 * In the case of O_APPEND, assume that we must always serialize 109 * zeroing of partial blocks by two competing unaligned AIOs can result
110 * in data corruption.
105 */ 111 */
106 if (o_direct && 112 if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
107 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
108 !is_sync_kiocb(iocb) && 113 !is_sync_kiocb(iocb) &&
109 (iocb->ki_flags & IOCB_APPEND || 114 ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
110 ext4_unaligned_aio(inode, from, iocb->ki_pos))) { 115 unaligned_aio = 1;
111 aio_mutex = ext4_aio_mutex(inode);
112 mutex_lock(aio_mutex);
113 ext4_unwritten_wait(inode); 116 ext4_unwritten_wait(inode);
114 } 117 }
115 118
116 inode_lock(inode);
117 ret = generic_write_checks(iocb, from);
118 if (ret <= 0)
119 goto out;
120
121 /* 119 /*
122 * If we have encountered a bitmap-format file, the size limit 120 * If we have encountered a bitmap-format file, the size limit
123 * is smaller than s_maxbytes, which is for extent-mapped files. 121 * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
139 blk_start_plug(&plug); 137 blk_start_plug(&plug);
140 138
141 /* check whether we do a DIO overwrite or not */ 139 /* check whether we do a DIO overwrite or not */
142 if (ext4_should_dioread_nolock(inode) && !aio_mutex && 140 if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
143 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { 141 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
144 struct ext4_map_blocks map; 142 struct ext4_map_blocks map;
145 unsigned int blkbits = inode->i_blkbits; 143 unsigned int blkbits = inode->i_blkbits;
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
181 if (o_direct) 179 if (o_direct)
182 blk_finish_plug(&plug); 180 blk_finish_plug(&plug);
183 181
184 if (aio_mutex)
185 mutex_unlock(aio_mutex);
186 return ret; 182 return ret;
187 183
188out: 184out:
189 inode_unlock(inode); 185 inode_unlock(inode);
190 if (aio_mutex)
191 mutex_unlock(aio_mutex);
192 return ret; 186 return ret;
193} 187}
194 188
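For reference, the unaligned_aio flag set above comes from a block-alignment test on the write position and the iovecs; roughly (paraphrased from ext4_unaligned_aio(), not verbatim):

	static int example_unaligned_aio(struct inode *inode,
					 struct iov_iter *from, loff_t pos)
	{
		int blockmask = inode->i_sb->s_blocksize - 1;

		/* positions at or past i_size are not treated as unaligned */
		if (pos >= i_size_read(inode))
			return 0;

		/* unaligned if the file position or any iovec segment is not
		 * a multiple of the block size */
		return ((pos | iov_iter_alignment(from)) & blockmask) ? 1 : 0;
	}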
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
417 */ 411 */
418static int ext4_find_unwritten_pgoff(struct inode *inode, 412static int ext4_find_unwritten_pgoff(struct inode *inode,
419 int whence, 413 int whence,
420 struct ext4_map_blocks *map, 414 ext4_lblk_t end_blk,
421 loff_t *offset) 415 loff_t *offset)
422{ 416{
423 struct pagevec pvec; 417 struct pagevec pvec;
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
432 blkbits = inode->i_sb->s_blocksize_bits; 426 blkbits = inode->i_sb->s_blocksize_bits;
433 startoff = *offset; 427 startoff = *offset;
434 lastoff = startoff; 428 lastoff = startoff;
435 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; 429 endoff = (loff_t)end_blk << blkbits;
436 430
437 index = startoff >> PAGE_CACHE_SHIFT; 431 index = startoff >> PAGE_CACHE_SHIFT;
438 end = endoff >> PAGE_CACHE_SHIFT; 432 end = endoff >> PAGE_CACHE_SHIFT;
@@ -550,12 +544,11 @@ out:
550static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 544static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
551{ 545{
552 struct inode *inode = file->f_mapping->host; 546 struct inode *inode = file->f_mapping->host;
553 struct ext4_map_blocks map;
554 struct extent_status es; 547 struct extent_status es;
555 ext4_lblk_t start, last, end; 548 ext4_lblk_t start, last, end;
556 loff_t dataoff, isize; 549 loff_t dataoff, isize;
557 int blkbits; 550 int blkbits;
558 int ret = 0; 551 int ret;
559 552
560 inode_lock(inode); 553 inode_lock(inode);
561 554
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
572 dataoff = offset; 565 dataoff = offset;
573 566
574 do { 567 do {
575 map.m_lblk = last; 568 ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
576 map.m_len = end - last + 1; 569 if (ret <= 0) {
577 ret = ext4_map_blocks(NULL, inode, &map, 0); 570 /* No extent found -> no data */
578 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 571 if (ret == 0)
579 if (last != start) 572 ret = -ENXIO;
580 dataoff = (loff_t)last << blkbits; 573 inode_unlock(inode);
581 break; 574 return ret;
582 } 575 }
583 576
584 /* 577 last = es.es_lblk;
585 * If there is a delay extent at this offset, 578 if (last != start)
586 * it will be as a data. 579 dataoff = (loff_t)last << blkbits;
587 */ 580 if (!ext4_es_is_unwritten(&es))
588 ext4_es_find_delayed_extent_range(inode, last, last, &es);
589 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
590 if (last != start)
591 dataoff = (loff_t)last << blkbits;
592 break; 581 break;
593 }
594 582
595 /* 583 /*
596 * If there is an unwritten extent at this offset, 584 * If there is an unwritten extent at this offset,
597 * it is treated as data or as a hole depending on 585 * it is treated as data or as a hole depending on
598 * whether the page cache holds data for it. 586 * whether the page cache holds data for it.
599 */ 587 */
600 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 588 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
601 int unwritten; 589 es.es_lblk + es.es_len, &dataoff))
602 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 590 break;
603 &map, &dataoff); 591 last += es.es_len;
604 if (unwritten)
605 break;
606 }
607
608 last++;
609 dataoff = (loff_t)last << blkbits; 592 dataoff = (loff_t)last << blkbits;
593 cond_resched();
610 } while (last <= end); 594 } while (last <= end);
611 595
612 inode_unlock(inode); 596 inode_unlock(inode);
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
623static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 607static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
624{ 608{
625 struct inode *inode = file->f_mapping->host; 609 struct inode *inode = file->f_mapping->host;
626 struct ext4_map_blocks map;
627 struct extent_status es; 610 struct extent_status es;
628 ext4_lblk_t start, last, end; 611 ext4_lblk_t start, last, end;
629 loff_t holeoff, isize; 612 loff_t holeoff, isize;
630 int blkbits; 613 int blkbits;
631 int ret = 0; 614 int ret;
632 615
633 inode_lock(inode); 616 inode_lock(inode);
634 617
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
645 holeoff = offset; 628 holeoff = offset;
646 629
647 do { 630 do {
648 map.m_lblk = last; 631 ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
649 map.m_len = end - last + 1; 632 if (ret < 0) {
650 ret = ext4_map_blocks(NULL, inode, &map, 0); 633 inode_unlock(inode);
651 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 634 return ret;
652 last += ret;
653 holeoff = (loff_t)last << blkbits;
654 continue;
655 } 635 }
656 636 /* Found a hole? */
657 /* 637 if (ret == 0 || es.es_lblk > last) {
658 * If there is a delay extent at this offset, 638 if (last != start)
659 * we will skip this extent. 639 holeoff = (loff_t)last << blkbits;
660 */ 640 break;
661 ext4_es_find_delayed_extent_range(inode, last, last, &es);
662 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
663 last = es.es_lblk + es.es_len;
664 holeoff = (loff_t)last << blkbits;
665 continue;
666 } 641 }
667
668 /* 642 /*
669 * If there is an unwritten extent at this offset, 643 * If there is an unwritten extent at this offset,
670 * it is treated as data or as a hole depending on 644 * it is treated as data or as a hole depending on
671 * whether the page cache holds data for it. 645 * whether the page cache holds data for it.
672 */ 646 */
673 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 647 if (ext4_es_is_unwritten(&es) &&
674 int unwritten; 648 ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
675 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 649 last + es.es_len, &holeoff))
676 &map, &holeoff); 650 break;
677 if (!unwritten) {
678 last += ret;
679 holeoff = (loff_t)last << blkbits;
680 continue;
681 }
682 }
683 651
684 /* find a hole */ 652 last += es.es_len;
685 break; 653 holeoff = (loff_t)last << blkbits;
654 cond_resched();
686 } while (last <= end); 655 } while (last <= end);
687 656
688 inode_unlock(inode); 657 inode_unlock(inode);
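Seen from userspace, the two rewritten loops implement the usual lseek() contract; an illustrative caller (plain POSIX, nothing ext4-specific):

	#include <unistd.h>

	static void probe(int fd, off_t off)
	{
		/* -1/ENXIO if only a hole remains: the ret == 0 -> -ENXIO
		 * mapping in ext4_seek_data() above */
		off_t data = lseek(fd, off, SEEK_DATA);
		/* there is always a virtual hole at i_size */
		off_t hole = lseek(fd, off, SEEK_HOLE);

		/* An unwritten (fallocated) extent reads as a hole unless the
		 * page cache holds data for it - which is exactly what
		 * ext4_find_unwritten_pgoff() checks. */
		(void)data;
		(void)hole;
	}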
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index acc0ad56bf2f..237b877d316d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
787 sbi = EXT4_SB(sb); 787 sbi = EXT4_SB(sb);
788 788
789 /* 789 /*
790 * Initalize owners and quota early so that we don't have to account 790 * Initialize owners and quota early so that we don't have to account
791 * for quota initialization worst case in standard inode creating 791 * for quota initialization worst case in standard inode creating
792 * transaction 792 * transaction
793 */ 793 */
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 355ef9c36c87..3027fa681de5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
555 goto got_it; 555 goto got_it;
556 } 556 }
557 557
558 /* Next simple case - plain lookup or failed read of indirect block */ 558 /* Next simple case - plain lookup failed */
559 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 559 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
560 unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
561 int i;
562
563 /* Count number of blocks in a subtree under 'partial' */
564 count = 1;
565 for (i = 0; partial + i != chain + depth - 1; i++)
566 count *= epb;
567 /* Fill in size of a hole we found */
568 map->m_pblk = 0;
569 map->m_len = min_t(unsigned int, map->m_len, count);
570 goto cleanup;
571 }
572
573 /* Failed read of indirect block */
574 if (err == -EIO)
560 goto cleanup; 575 goto cleanup;
561 576
562 /* 577 /*
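The subtree arithmetic above grows geometrically with the depth at which the lookup stopped. With hypothetical 4 KiB blocks, each indirect block holds epb = 4096 / sizeof(u32) = 1024 pointers, so:

	/* 'partial' points at the first missing block in the chain; every
	 * level between it and the leaves multiplies the hole size by epb. */
	unsigned epb = 4096 / sizeof(u32);		/* 1024 */

	/* missing leaf block:		count = 1		block  */
	/* missing indirect block:	count = 1024		blocks */
	/* missing double indirect:	count = 1024 * 1024	blocks */
	/* map->m_len is then clamped to min(map->m_len, count).      */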
@@ -693,21 +708,21 @@ retry:
693 } 708 }
694 if (IS_DAX(inode)) 709 if (IS_DAX(inode))
695 ret = dax_do_io(iocb, inode, iter, offset, 710 ret = dax_do_io(iocb, inode, iter, offset,
696 ext4_get_block, NULL, 0); 711 ext4_dio_get_block, NULL, 0);
697 else 712 else
698 ret = __blockdev_direct_IO(iocb, inode, 713 ret = __blockdev_direct_IO(iocb, inode,
699 inode->i_sb->s_bdev, iter, 714 inode->i_sb->s_bdev, iter,
700 offset, ext4_get_block, NULL, 715 offset, ext4_dio_get_block,
701 NULL, 0); 716 NULL, NULL, 0);
702 inode_dio_end(inode); 717 inode_dio_end(inode);
703 } else { 718 } else {
704locked: 719locked:
705 if (IS_DAX(inode)) 720 if (IS_DAX(inode))
706 ret = dax_do_io(iocb, inode, iter, offset, 721 ret = dax_do_io(iocb, inode, iter, offset,
707 ext4_get_block, NULL, DIO_LOCKING); 722 ext4_dio_get_block, NULL, DIO_LOCKING);
708 else 723 else
709 ret = blockdev_direct_IO(iocb, inode, iter, offset, 724 ret = blockdev_direct_IO(iocb, inode, iter, offset,
710 ext4_get_block); 725 ext4_dio_get_block);
711 726
712 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { 727 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
713 loff_t isize = i_size_read(inode); 728 loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index dfe3b9bafc0d..7cbdd3752ba5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -581,9 +581,10 @@ retry:
581 if (ret) 581 if (ret)
582 goto out; 582 goto out;
583 583
584 if (ext4_should_dioread_nolock(inode)) 584 if (ext4_should_dioread_nolock(inode)) {
585 ret = __block_write_begin(page, from, to, ext4_get_block_write); 585 ret = __block_write_begin(page, from, to,
586 else 586 ext4_get_block_unwritten);
587 } else
587 ret = __block_write_begin(page, from, to, ext4_get_block); 588 ret = __block_write_begin(page, from, to, ext4_get_block);
588 589
589 if (!ret && ext4_should_journal_data(inode)) { 590 if (!ret && ext4_should_journal_data(inode)) {
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
1696 if (err) 1697 if (err)
1697 goto out; 1698 goto out;
1698 1699
1699 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1700 err = ext4_mark_inode_dirty(handle, dir); 1700 err = ext4_mark_inode_dirty(handle, dir);
1701 if (unlikely(err)) 1701 if (unlikely(err))
1702 goto out; 1702 goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index aee960b1af34..dab84a2530ff 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
216 } 216 }
217 truncate_inode_pages_final(&inode->i_data); 217 truncate_inode_pages_final(&inode->i_data);
218 218
219 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
220 goto no_delete; 219 goto no_delete;
221 } 220 }
222 221
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
228 ext4_begin_ordered_truncate(inode, 0); 227 ext4_begin_ordered_truncate(inode, 0);
229 truncate_inode_pages_final(&inode->i_data); 228 truncate_inode_pages_final(&inode->i_data);
230 229
231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
232
233 /* 230 /*
234 * Protect us against freezing - iput() caller didn't have to have any 231 * Protect us against freezing - iput() caller didn't have to have any
235 * protection against it 232 * protection against it
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
458 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping 455 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
459 * based files 456 * based files
460 * 457 *
461 * On success, it returns the number of blocks being mapped or allocated. 458 * On success, it returns the number of blocks being mapped or allocated. If
462 * if create==0 and the blocks are pre-allocated and unwritten block, 459 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
463 * the result buffer head is unmapped. If the create ==1, it will make sure 460 * is marked as unwritten. If create == 1, it will mark @map as mapped.
464 * the buffer head is mapped.
465 * 461 *
466 * It returns 0 if plain look up failed (blocks have not been allocated), in 462 * It returns 0 if plain look up failed (blocks have not been allocated), in
467 * that case, buffer head is unmapped 463 * that case, @map is returned as unmapped but we still fill in map->m_len to
464 * indicate the length of a hole starting at map->m_lblk.
468 * 465 *
469 * It returns the error in case of allocation failure. 466 * It returns the error in case of allocation failure.
470 */ 467 */
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
507 retval = map->m_len; 504 retval = map->m_len;
508 map->m_len = retval; 505 map->m_len = retval;
509 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { 506 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
507 map->m_pblk = 0;
508 retval = es.es_len - (map->m_lblk - es.es_lblk);
509 if (retval > map->m_len)
510 retval = map->m_len;
511 map->m_len = retval;
510 retval = 0; 512 retval = 0;
511 } else { 513 } else {
512 BUG_ON(1); 514 BUG_ON(1);
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
714 cmpxchg(&bh->b_state, old_state, new_state) != old_state)); 716 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
715} 717}
716 718
717/* Maximum number of blocks we map for direct IO at once. */
718#define DIO_MAX_BLOCKS 4096
719
720static int _ext4_get_block(struct inode *inode, sector_t iblock, 719static int _ext4_get_block(struct inode *inode, sector_t iblock,
721 struct buffer_head *bh, int flags) 720 struct buffer_head *bh, int flags)
722{ 721{
723 handle_t *handle = ext4_journal_current_handle();
724 struct ext4_map_blocks map; 722 struct ext4_map_blocks map;
725 int ret = 0, started = 0; 723 int ret = 0;
726 int dio_credits;
727 724
728 if (ext4_has_inline_data(inode)) 725 if (ext4_has_inline_data(inode))
729 return -ERANGE; 726 return -ERANGE;
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
731 map.m_lblk = iblock; 728 map.m_lblk = iblock;
732 map.m_len = bh->b_size >> inode->i_blkbits; 729 map.m_len = bh->b_size >> inode->i_blkbits;
733 730
734 if (flags && !handle) { 731 ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
735 /* Direct IO write... */ 732 flags);
736 if (map.m_len > DIO_MAX_BLOCKS)
737 map.m_len = DIO_MAX_BLOCKS;
738 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
739 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
740 dio_credits);
741 if (IS_ERR(handle)) {
742 ret = PTR_ERR(handle);
743 return ret;
744 }
745 started = 1;
746 }
747
748 ret = ext4_map_blocks(handle, inode, &map, flags);
749 if (ret > 0) { 733 if (ret > 0) {
750 ext4_io_end_t *io_end = ext4_inode_aio(inode);
751
752 map_bh(bh, inode->i_sb, map.m_pblk); 734 map_bh(bh, inode->i_sb, map.m_pblk);
753 ext4_update_bh_state(bh, map.m_flags); 735 ext4_update_bh_state(bh, map.m_flags);
754 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
755 set_buffer_defer_completion(bh);
756 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 736 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
757 ret = 0; 737 ret = 0;
758 } 738 }
759 if (started)
760 ext4_journal_stop(handle);
761 return ret; 739 return ret;
762} 740}
763 741
@@ -769,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
769} 747}
770 748
771/* 749/*
750 * Get block function used when preparing for buffered write if we require
751 * creating an unwritten extent if blocks haven't been allocated. The extent
752 * will be converted to written after the IO is complete.
753 */
754int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
755 struct buffer_head *bh_result, int create)
756{
757 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
758 inode->i_ino, create);
759 return _ext4_get_block(inode, iblock, bh_result,
760 EXT4_GET_BLOCKS_IO_CREATE_EXT);
761}
762
763/* Maximum number of blocks we map for direct IO at once. */
764#define DIO_MAX_BLOCKS 4096
765
766static handle_t *start_dio_trans(struct inode *inode,
767 struct buffer_head *bh_result)
768{
769 int dio_credits;
770
771 /* Trim mapping request to maximum we can map at once for DIO */
772 if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
773 bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
774 dio_credits = ext4_chunk_trans_blocks(inode,
775 bh_result->b_size >> inode->i_blkbits);
776 return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
777}
778
779/* Get block function for DIO reads and writes to inodes without extents */
780int ext4_dio_get_block(struct inode *inode, sector_t iblock,
781 struct buffer_head *bh, int create)
782{
783 handle_t *handle;
784 int ret;
785
786 /* We don't expect handle for direct IO */
787 WARN_ON_ONCE(ext4_journal_current_handle());
788
789 if (create) {
790 handle = start_dio_trans(inode, bh);
791 if (IS_ERR(handle))
792 return PTR_ERR(handle);
793 }
794 ret = _ext4_get_block(inode, iblock, bh,
795 create ? EXT4_GET_BLOCKS_CREATE : 0);
796 if (create)
797 ext4_journal_stop(handle);
798 return ret;
799}
800
801/*
802 * Get block function for AIO DIO writes when we create unwritten extent if
803 * blocks are not allocated yet. The extent will be converted to written
804 * after IO is complete.
805 */
806static int ext4_dio_get_block_unwritten_async(struct inode *inode,
807 sector_t iblock, struct buffer_head *bh_result, int create)
808{
809 handle_t *handle;
810 int ret;
811
812 /* We don't expect handle for direct IO */
813 WARN_ON_ONCE(ext4_journal_current_handle());
814
815 handle = start_dio_trans(inode, bh_result);
816 if (IS_ERR(handle))
817 return PTR_ERR(handle);
818 ret = _ext4_get_block(inode, iblock, bh_result,
819 EXT4_GET_BLOCKS_IO_CREATE_EXT);
820 ext4_journal_stop(handle);
821
822 /*
823 * When doing DIO using unwritten extents, we need io_end to convert
824 * unwritten extents to written on IO completion. We allocate the io_end
825 * once we spot an unwritten extent and store it in b_private. Generic
826 * DIO code keeps b_private set and furthermore passes the value to
827 * our completion callback in 'private' argument.
828 */
829 if (!ret && buffer_unwritten(bh_result)) {
830 if (!bh_result->b_private) {
831 ext4_io_end_t *io_end;
832
833 io_end = ext4_init_io_end(inode, GFP_KERNEL);
834 if (!io_end)
835 return -ENOMEM;
836 bh_result->b_private = io_end;
837 ext4_set_io_unwritten_flag(inode, io_end);
838 }
839 set_buffer_defer_completion(bh_result);
840 }
841
842 return ret;
843}
844
845/*
846 * Get block function for non-AIO DIO writes when we create unwritten extent if
847 * blocks are not allocated yet. The extent will be converted to written
848 * after IO is complete from ext4_ext_direct_IO() function.
849 */
850static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
851 sector_t iblock, struct buffer_head *bh_result, int create)
852{
853 handle_t *handle;
854 int ret;
855
856 /* We don't expect handle for direct IO */
857 WARN_ON_ONCE(ext4_journal_current_handle());
858
859 handle = start_dio_trans(inode, bh_result);
860 if (IS_ERR(handle))
861 return PTR_ERR(handle);
862 ret = _ext4_get_block(inode, iblock, bh_result,
863 EXT4_GET_BLOCKS_IO_CREATE_EXT);
864 ext4_journal_stop(handle);
865
866 /*
867 * Mark inode as having pending DIO writes to unwritten extents.
868 * ext4_ext_direct_IO() checks this flag and converts extents to
869 * written.
870 */
871 if (!ret && buffer_unwritten(bh_result))
872 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
873
874 return ret;
875}
876
877static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
878 struct buffer_head *bh_result, int create)
879{
880 int ret;
881
882 ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
883 inode->i_ino, create);
884 /* We don't expect handle for direct IO */
885 WARN_ON_ONCE(ext4_journal_current_handle());
886
887 ret = _ext4_get_block(inode, iblock, bh_result, 0);
888 /*
889 * Blocks should have been preallocated! ext4_file_write_iter() checks
890 * that.
891 */
892 WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
893
894 return ret;
895}
896
897
898/*
772 * `handle' can be NULL if create is zero 899 * `handle' can be NULL if create is zero
773 */ 900 */
774struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 901struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -1079,13 +1206,14 @@ retry_journal:
1079#ifdef CONFIG_EXT4_FS_ENCRYPTION 1206#ifdef CONFIG_EXT4_FS_ENCRYPTION
1080 if (ext4_should_dioread_nolock(inode)) 1207 if (ext4_should_dioread_nolock(inode))
1081 ret = ext4_block_write_begin(page, pos, len, 1208 ret = ext4_block_write_begin(page, pos, len,
1082 ext4_get_block_write); 1209 ext4_get_block_unwritten);
1083 else 1210 else
1084 ret = ext4_block_write_begin(page, pos, len, 1211 ret = ext4_block_write_begin(page, pos, len,
1085 ext4_get_block); 1212 ext4_get_block);
1086#else 1213#else
1087 if (ext4_should_dioread_nolock(inode)) 1214 if (ext4_should_dioread_nolock(inode))
1088 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1215 ret = __block_write_begin(page, pos, len,
1216 ext4_get_block_unwritten);
1089 else 1217 else
1090 ret = __block_write_begin(page, pos, len, ext4_get_block); 1218 ret = __block_write_begin(page, pos, len, ext4_get_block);
1091#endif 1219#endif
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3088 return try_to_free_buffers(page); 3216 return try_to_free_buffers(page);
3089} 3217}
3090 3218
3091/*
3092 * ext4_get_block used when preparing for a DIO write or buffer write.
3093 * We allocate an uinitialized extent if blocks haven't been allocated.
3094 * The extent will be converted to initialized after the IO is complete.
3095 */
3096int ext4_get_block_write(struct inode *inode, sector_t iblock,
3097 struct buffer_head *bh_result, int create)
3098{
3099 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3100 inode->i_ino, create);
3101 return _ext4_get_block(inode, iblock, bh_result,
3102 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3103}
3104
3105static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
3106 struct buffer_head *bh_result, int create)
3107{
3108 int ret;
3109
3110 ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
3111 inode->i_ino, create);
3112 ret = _ext4_get_block(inode, iblock, bh_result, 0);
3113 /*
3114 * Blocks should have been preallocated! ext4_file_write_iter() checks
3115 * that.
3116 */
3117 WARN_ON_ONCE(!buffer_mapped(bh_result));
3118
3119 return ret;
3120}
3121
3122#ifdef CONFIG_FS_DAX 3219#ifdef CONFIG_FS_DAX
3123int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3220int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
3124 struct buffer_head *bh_result, int create) 3221 struct buffer_head *bh_result, int create)
@@ -3179,13 +3276,12 @@ out:
3179 WARN_ON_ONCE(ret == 0 && create); 3276 WARN_ON_ONCE(ret == 0 && create);
3180 if (ret > 0) { 3277 if (ret > 0) {
3181 map_bh(bh_result, inode->i_sb, map.m_pblk); 3278 map_bh(bh_result, inode->i_sb, map.m_pblk);
3182 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
3183 map.m_flags;
3184 /* 3279 /*
3185 * At least for now we have to clear BH_New so that DAX code 3280 * At least for now we have to clear BH_New so that DAX code
3186 * doesn't attempt to zero blocks again in a racy way. 3281 * doesn't attempt to zero blocks again in a racy way.
3187 */ 3282 */
3188 bh_result->b_state &= ~(1 << BH_New); 3283 map.m_flags &= ~EXT4_MAP_NEW;
3284 ext4_update_bh_state(bh_result, map.m_flags);
3189 bh_result->b_size = map.m_len << inode->i_blkbits; 3285 bh_result->b_size = map.m_len << inode->i_blkbits;
3190 ret = 0; 3286 ret = 0;
3191 } 3287 }
@@ -3193,24 +3289,32 @@ out:
3193} 3289}
3194#endif 3290#endif
3195 3291
3196static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3292static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3197 ssize_t size, void *private) 3293 ssize_t size, void *private)
3198{ 3294{
3199 ext4_io_end_t *io_end = iocb->private; 3295 ext4_io_end_t *io_end = private;
3200 3296
3201 /* if not async direct IO just return */ 3297 /* if not async direct IO just return */
3202 if (!io_end) 3298 if (!io_end)
3203 return; 3299 return 0;
3204 3300
3205 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3301 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3206 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3302 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
3207 iocb->private, io_end->inode->i_ino, iocb, offset, 3303 io_end, io_end->inode->i_ino, iocb, offset, size);
3208 size);
3209 3304
3210 iocb->private = NULL; 3305 /*
3306 * Error during AIO DIO. We cannot convert unwritten extents as the
3307 * data was not written. Just clear the unwritten flag and drop io_end.
3308 */
3309 if (size <= 0) {
3310 ext4_clear_io_unwritten_flag(io_end);
3311 size = 0;
3312 }
3211 io_end->offset = offset; 3313 io_end->offset = offset;
3212 io_end->size = size; 3314 io_end->size = size;
3213 ext4_put_io_end(io_end); 3315 ext4_put_io_end(io_end);
3316
3317 return 0;
3214} 3318}
3215 3319
3216/* 3320/*
@@ -3243,7 +3347,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3243 get_block_t *get_block_func = NULL; 3347 get_block_t *get_block_func = NULL;
3244 int dio_flags = 0; 3348 int dio_flags = 0;
3245 loff_t final_size = offset + count; 3349 loff_t final_size = offset + count;
3246 ext4_io_end_t *io_end = NULL;
3247 3350
3248 /* Use the old path for reads and writes beyond i_size. */ 3351 /* Use the old path for reads and writes beyond i_size. */
3249 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3352 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3268,16 +3371,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3268 /* 3371 /*
3269 * We could direct write to holes and fallocate. 3372 * We could direct write to holes and fallocate.
3270 * 3373 *
3271 * Allocated blocks to fill the hole are marked as 3374 * Allocated blocks to fill the hole are marked as unwritten to prevent
3272 * unwritten to prevent parallel buffered read to expose 3375 * a parallel buffered read from exposing stale data before the DIO
3273 * the stale data before DIO complete the data IO. 3376 * completes the data IO.
3274 * 3377 *
3275 * As to previously fallocated extents, ext4 get_block will 3378 * As to previously fallocated extents, ext4 get_block will simply
3276 * just simply mark the buffer mapped but still keep the 3379 * mark the buffer mapped but still keep the extents unwritten.
3277 * extents unwritten.
3278 * 3380 *
3279 * For non AIO case, we will convert those unwritten extents 3381 * For non AIO case, we will convert those unwritten extents to written
3280 * to written after return back from blockdev_direct_IO. 3382 * after returning from blockdev_direct_IO. That way we save ourselves
3383 * from allocating an io_end structure and also the overhead of
3384 * offloading the extent conversion to a workqueue.
3281 * 3385 *
3282 * For async DIO, the conversion needs to be deferred when the 3386 * For async DIO, the conversion needs to be deferred when the
3283 * IO is completed. The ext4 end_io callback function will be 3387 * IO is completed. The ext4 end_io callback function will be
@@ -3285,30 +3389,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3285 * case, we allocate an io_end structure to hook to the iocb. 3389 * case, we allocate an io_end structure to hook to the iocb.
3286 */ 3390 */
3287 iocb->private = NULL; 3391 iocb->private = NULL;
3288 if (overwrite) { 3392 if (overwrite)
3289 get_block_func = ext4_get_block_overwrite; 3393 get_block_func = ext4_dio_get_block_overwrite;
3394 else if (is_sync_kiocb(iocb)) {
3395 get_block_func = ext4_dio_get_block_unwritten_sync;
3396 dio_flags = DIO_LOCKING;
3290 } else { 3397 } else {
3291 ext4_inode_aio_set(inode, NULL); 3398 get_block_func = ext4_dio_get_block_unwritten_async;
3292 if (!is_sync_kiocb(iocb)) {
3293 io_end = ext4_init_io_end(inode, GFP_NOFS);
3294 if (!io_end) {
3295 ret = -ENOMEM;
3296 goto retake_lock;
3297 }
3298 /*
3299 * Grab reference for DIO. Will be dropped in
3300 * ext4_end_io_dio()
3301 */
3302 iocb->private = ext4_get_io_end(io_end);
3303 /*
3304 * we save the io structure for current async direct
3305 * IO, so that later ext4_map_blocks() could flag the
3306 * io structure whether there is a unwritten extents
3307 * needs to be converted when IO is completed.
3308 */
3309 ext4_inode_aio_set(inode, io_end);
3310 }
3311 get_block_func = ext4_get_block_write;
3312 dio_flags = DIO_LOCKING; 3399 dio_flags = DIO_LOCKING;
3313 } 3400 }
3314#ifdef CONFIG_EXT4_FS_ENCRYPTION 3401#ifdef CONFIG_EXT4_FS_ENCRYPTION
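Restating the rewritten selection as a decision table (no new code, just the three cases from the hunk above spelled out):

	if (overwrite)				/* blocks preallocated and
						 * already written: no journal
						 * handle, no conversion */
		get_block_func = ext4_dio_get_block_overwrite;
	else if (is_sync_kiocb(iocb)) {		/* sync DIO: flag the inode,
						 * convert inline on return */
		get_block_func = ext4_dio_get_block_unwritten_sync;
		dio_flags = DIO_LOCKING;
	} else {				/* AIO DIO: io_end allocated
						 * lazily in get_block, dropped
						 * by ext4_end_io_dio() */
		get_block_func = ext4_dio_get_block_unwritten_async;
		dio_flags = DIO_LOCKING;
	}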
@@ -3323,27 +3410,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3323 get_block_func, 3410 get_block_func,
3324 ext4_end_io_dio, NULL, dio_flags); 3411 ext4_end_io_dio, NULL, dio_flags);
3325 3412
3326 /*
3327 * Put our reference to io_end. This can free the io_end structure e.g.
3328 * in sync IO case or in case of error. It can even perform extent
3329 * conversion if all bios we submitted finished before we got here.
3330 * Note that in that case iocb->private can be already set to NULL
3331 * here.
3332 */
3333 if (io_end) {
3334 ext4_inode_aio_set(inode, NULL);
3335 ext4_put_io_end(io_end);
3336 /*
3337 * When no IO was submitted ext4_end_io_dio() was not
3338 * called so we have to put iocb's reference.
3339 */
3340 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3341 WARN_ON(iocb->private != io_end);
3342 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3343 ext4_put_io_end(io_end);
3344 iocb->private = NULL;
3345 }
3346 }
3347 if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3413 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3348 EXT4_STATE_DIO_UNWRITTEN)) { 3414 EXT4_STATE_DIO_UNWRITTEN)) {
3349 int err; 3415 int err;
@@ -3358,7 +3424,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3358 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3424 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3359 } 3425 }
3360 3426
3361retake_lock:
3362 if (iov_iter_rw(iter) == WRITE) 3427 if (iov_iter_rw(iter) == WRITE)
3363 inode_dio_end(inode); 3428 inode_dio_end(inode);
3364 /* take i_mutex locking again if we do an overwrite dio */ 3429 /* take i_mutex locking again if we do an overwrite dio */
@@ -5261,6 +5326,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5261 might_sleep(); 5326 might_sleep();
5262 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5327 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5263 err = ext4_reserve_inode_write(handle, inode, &iloc); 5328 err = ext4_reserve_inode_write(handle, inode, &iloc);
5329 if (err)
5330 return err;
5264 if (ext4_handle_valid(handle) && 5331 if (ext4_handle_valid(handle) &&
5265 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5332 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5266 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5333 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5291,9 +5358,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5291 } 5358 }
5292 } 5359 }
5293 } 5360 }
5294 if (!err) 5361 return ext4_mark_iloc_dirty(handle, inode, &iloc);
5295 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5296 return err;
5297} 5362}
5298 5363
5299/* 5364/*
@@ -5502,7 +5567,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5502 unlock_page(page); 5567 unlock_page(page);
5503 /* OK, we need to fill the hole... */ 5568 /* OK, we need to fill the hole... */
5504 if (ext4_should_dioread_nolock(inode)) 5569 if (ext4_should_dioread_nolock(inode))
5505 get_block = ext4_get_block_write; 5570 get_block = ext4_get_block_unwritten;
5506 else 5571 else
5507 get_block = ext4_get_block; 5572 get_block = ext4_get_block;
5508retry_alloc: 5573retry_alloc:
@@ -5545,3 +5610,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
5545 5610
5546 return err; 5611 return err;
5547} 5612}
5613
5614/*
5615 * Find the first non-hole extent at or after @lblk in an inode.
5616 * Search for @map_len blocks at most. The extent is returned in @result.
5617 *
5618 * Returns 1 if an extent was found, and 0 if there is no extent at or
5619 * after @lblk (in which case @result->es_len is also set to 0). On
5620 * error, a negative error code is returned.
5621 */
5622int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
5623 unsigned int map_len, struct extent_status *result)
5624{
5625 struct ext4_map_blocks map;
5626 struct extent_status es = {};
5627 int ret;
5628
5629 map.m_lblk = lblk;
5630 map.m_len = map_len;
5631
5632 /*
5633 * For non-extent-based files this loop may iterate several times since
5634 * we do not determine the full hole size.
5635 */
5636 while (map.m_len > 0) {
5637 ret = ext4_map_blocks(NULL, inode, &map, 0);
5638 if (ret < 0)
5639 return ret;
5640 /* Is there an extent covering m_lblk? Just return it. */
5641 if (ret > 0) {
5642 int status;
5643
5644 ext4_es_store_pblock(result, map.m_pblk);
5645 result->es_lblk = map.m_lblk;
5646 result->es_len = map.m_len;
5647 if (map.m_flags & EXT4_MAP_UNWRITTEN)
5648 status = EXTENT_STATUS_UNWRITTEN;
5649 else
5650 status = EXTENT_STATUS_WRITTEN;
5651 ext4_es_store_status(result, status);
5652 return 1;
5653 }
5654 ext4_es_find_delayed_extent_range(inode, map.m_lblk,
5655 map.m_lblk + map.m_len - 1,
5656 &es);
5657 /* Is there delalloc data before the next block in the extent tree? */
5658 if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
5659 ext4_lblk_t offset = 0;
5660
5661 if (es.es_lblk < lblk)
5662 offset = lblk - es.es_lblk;
5663 result->es_lblk = es.es_lblk + offset;
5664 ext4_es_store_pblock(result,
5665 ext4_es_pblock(&es) + offset);
5666 result->es_len = es.es_len - offset;
5667 ext4_es_store_status(result, ext4_es_status(&es));
5668
5669 return 1;
5670 }
5671 /* There's a hole at m_lblk; advance past it */
5672 map.m_lblk += map.m_len;
5673 map_len -= map.m_len;
5674 map.m_len = map_len;
5675 cond_resched();
5676 }
5677 result->es_len = 0;
5678 return 0;
5679}
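
ext4_get_next_extent() is self-contained above; a hypothetical caller would walk a range roughly like this (the helper name and loop are illustrative only, not part of the patch):

	static int walk_extents(struct inode *inode, ext4_lblk_t lblk,
				unsigned int len)
	{
		struct extent_status es;
		int ret;

		/* Each found extent starts at or after lblk, so lblk strictly
		 * advances and the walk terminates. */
		while ((ret = ext4_get_next_extent(inode, lblk, len, &es)) == 1)
			lblk = es.es_lblk + es.es_len;
		return ret;	/* 0: no further extents, <0: error */
	}
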
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4424b7bf8ac6..50e05df28f66 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public Licens 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 17 */
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
815 * for this page; do not hold this lock when calling this routine! 815 * for this page; do not hold this lock when calling this routine!
816 */ 816 */
817 817
818static int ext4_mb_init_cache(struct page *page, char *incore) 818static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
819{ 819{
820 ext4_group_t ngroups; 820 ext4_group_t ngroups;
821 int blocksize; 821 int blocksize;
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* allocate buffer_heads to read bitmaps */ 848 /* allocate buffer_heads to read bitmaps */
849 if (groups_per_page > 1) { 849 if (groups_per_page > 1) {
850 i = sizeof(struct buffer_head *) * groups_per_page; 850 i = sizeof(struct buffer_head *) * groups_per_page;
851 bh = kzalloc(i, GFP_NOFS); 851 bh = kzalloc(i, gfp);
852 if (bh == NULL) { 852 if (bh == NULL) {
853 err = -ENOMEM; 853 err = -ENOMEM;
854 goto out; 854 goto out;
@@ -983,7 +983,7 @@ out:
983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
984 */ 984 */
985static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 985static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
986 ext4_group_t group, struct ext4_buddy *e4b) 986 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
987{ 987{
988 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 988 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
989 int block, pnum, poff; 989 int block, pnum, poff;
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1002 block = group * 2; 1002 block = group * 2;
1003 pnum = block / blocks_per_page; 1003 pnum = block / blocks_per_page;
1004 poff = block % blocks_per_page; 1004 poff = block % blocks_per_page;
1005 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1005 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1006 if (!page) 1006 if (!page)
1007 return -ENOMEM; 1007 return -ENOMEM;
1008 BUG_ON(page->mapping != inode->i_mapping); 1008 BUG_ON(page->mapping != inode->i_mapping);
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1016 1016
1017 block++; 1017 block++;
1018 pnum = block / blocks_per_page; 1018 pnum = block / blocks_per_page;
1019 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1019 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1020 if (!page) 1020 if (!page)
1021 return -ENOMEM; 1021 return -ENOMEM;
1022 BUG_ON(page->mapping != inode->i_mapping); 1022 BUG_ON(page->mapping != inode->i_mapping);
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1042 * calling this routine! 1042 * calling this routine!
1043 */ 1043 */
1044static noinline_for_stack 1044static noinline_for_stack
1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1046{ 1046{
1047 1047
1048 struct ext4_group_info *this_grp; 1048 struct ext4_group_info *this_grp;
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1062 * The call to ext4_mb_get_buddy_page_lock will mark the 1062 * The call to ext4_mb_get_buddy_page_lock will mark the
1063 * page accessed. 1063 * page accessed.
1064 */ 1064 */
1065 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); 1065 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1066 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1066 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1067 /* 1067 /*
1068 * somebody initialized the group 1068 * somebody initialized the group
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1072 } 1072 }
1073 1073
1074 page = e4b.bd_bitmap_page; 1074 page = e4b.bd_bitmap_page;
1075 ret = ext4_mb_init_cache(page, NULL); 1075 ret = ext4_mb_init_cache(page, NULL, gfp);
1076 if (ret) 1076 if (ret)
1077 goto err; 1077 goto err;
1078 if (!PageUptodate(page)) { 1078 if (!PageUptodate(page)) {
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1091 } 1091 }
1092 /* init buddy cache */ 1092 /* init buddy cache */
1093 page = e4b.bd_buddy_page; 1093 page = e4b.bd_buddy_page;
1094 ret = ext4_mb_init_cache(page, e4b.bd_bitmap); 1094 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1095 if (ret) 1095 if (ret)
1096 goto err; 1096 goto err;
1097 if (!PageUptodate(page)) { 1097 if (!PageUptodate(page)) {
@@ -1109,8 +1109,8 @@ err:
1109 * calling this routine! 1109 * calling this routine!
1110 */ 1110 */
1111static noinline_for_stack int 1111static noinline_for_stack int
1112ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1112ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1113 struct ext4_buddy *e4b) 1113 struct ext4_buddy *e4b, gfp_t gfp)
1114{ 1114{
1115 int blocks_per_page; 1115 int blocks_per_page;
1116 int block; 1116 int block;
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1140 * we need full data about the group 1140 * we need full data about the group
1141 * to make a good selection 1141 * to make a good selection
1142 */ 1142 */
1143 ret = ext4_mb_init_group(sb, group); 1143 ret = ext4_mb_init_group(sb, group, gfp);
1144 if (ret) 1144 if (ret)
1145 return ret; 1145 return ret;
1146 } 1146 }
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1168 * wait for it to initialize. 1168 * wait for it to initialize.
1169 */ 1169 */
1170 page_cache_release(page); 1170 page_cache_release(page);
1171 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1171 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1172 if (page) { 1172 if (page) {
1173 BUG_ON(page->mapping != inode->i_mapping); 1173 BUG_ON(page->mapping != inode->i_mapping);
1174 if (!PageUptodate(page)) { 1174 if (!PageUptodate(page)) {
1175 ret = ext4_mb_init_cache(page, NULL); 1175 ret = ext4_mb_init_cache(page, NULL, gfp);
1176 if (ret) { 1176 if (ret) {
1177 unlock_page(page); 1177 unlock_page(page);
1178 goto err; 1178 goto err;
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1204 if (page == NULL || !PageUptodate(page)) { 1204 if (page == NULL || !PageUptodate(page)) {
1205 if (page) 1205 if (page)
1206 page_cache_release(page); 1206 page_cache_release(page);
1207 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1207 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1208 if (page) { 1208 if (page) {
1209 BUG_ON(page->mapping != inode->i_mapping); 1209 BUG_ON(page->mapping != inode->i_mapping);
1210 if (!PageUptodate(page)) { 1210 if (!PageUptodate(page)) {
1211 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1211 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1212 gfp);
1212 if (ret) { 1213 if (ret) {
1213 unlock_page(page); 1214 unlock_page(page);
1214 goto err; 1215 goto err;
@@ -1247,6 +1248,12 @@ err:
1247 return ret; 1248 return ret;
1248} 1249}
1249 1250
1251static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1252 struct ext4_buddy *e4b)
1253{
1254 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1255}
1256
1250static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1257static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1251{ 1258{
1252 if (e4b->bd_bitmap_page) 1259 if (e4b->bd_bitmap_page)
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2045 2052
2046 /* We only do this if the grp has never been initialized */ 2053 /* We only do this if the grp has never been initialized */
2047 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2054 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2048 int ret = ext4_mb_init_group(ac->ac_sb, group); 2055 int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2049 if (ret) 2056 if (ret)
2050 return ret; 2057 return ret;
2051 } 2058 }
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4695 } 4702 }
4696 4703
4697 /* 4704 /*
4698 * We need to make sure we don't reuse the freed block until
4699 * after the transaction is committed, which we can do by
4700 * treating the block as metadata, below. We make an
4701 * exception if the inode is to be written in writeback mode
4702 * since writeback mode has weak data consistency guarantees.
4703 */
4704 if (!ext4_should_writeback_data(inode))
4705 flags |= EXT4_FREE_BLOCKS_METADATA;
4706
4707 /*
4708 * If the extent to be freed does not begin on a cluster 4705 * If the extent to be freed does not begin on a cluster
4709 * boundary, we need to deal with partial clusters at the 4706 * boundary, we need to deal with partial clusters at the
4710 * beginning and end of the extent. Normally we will free 4707 * beginning and end of the extent. Normally we will free
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4738 4735
4739 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 4736 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4740 int i; 4737 int i;
4738 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
4741 4739
4742 for (i = 0; i < count; i++) { 4740 for (i = 0; i < count; i++) {
4743 cond_resched(); 4741 cond_resched();
4744 bh = sb_find_get_block(inode->i_sb, block + i); 4742 if (is_metadata)
4745 if (!bh) 4743 bh = sb_find_get_block(inode->i_sb, block + i);
4746 continue; 4744 ext4_forget(handle, is_metadata, inode, bh, block + i);
4747 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4748 inode, bh, block + i);
4749 } 4745 }
4750 } 4746 }
4751 4747
@@ -4815,16 +4811,23 @@ do_more:
4815#endif 4811#endif
4816 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 4812 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4817 4813
4818 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4814 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4815 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4816 GFP_NOFS|__GFP_NOFAIL);
4819 if (err) 4817 if (err)
4820 goto error_return; 4818 goto error_return;
4821 4819
4822 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4820 /*
4821 * We need to make sure we don't reuse the freed block until after the
4822 * transaction is committed. We make an exception if the inode is to be
4823 * written in writeback mode since writeback mode has weak data
4824 * consistency guarantees.
4825 */
4826 if (ext4_handle_valid(handle) &&
4827 ((flags & EXT4_FREE_BLOCKS_METADATA) ||
4828 !ext4_should_writeback_data(inode))) {
4823 struct ext4_free_data *new_entry; 4829 struct ext4_free_data *new_entry;
4824 /* 4830 /*
4825 * blocks being freed are metadata. these blocks shouldn't
4826 * be used until this transaction is committed
4827 *
4828 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed 4831 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4829 * to fail. 4832 * to fail.
4830 */ 4833 */
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5217 grp = ext4_get_group_info(sb, group); 5220 grp = ext4_get_group_info(sb, group);
5218 /* We only do this if the grp has never been initialized */ 5221 /* We only do this if the grp has never been initialized */
5219 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 5222 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5220 ret = ext4_mb_init_group(sb, group); 5223 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5221 if (ret) 5224 if (ret)
5222 break; 5225 break;
5223 } 5226 }
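
The mballoc hunks above apply one pattern repeatedly: thread a gfp_t down the call chain and keep the old entry point as a thin wrapper supplying the historical GFP_NOFS default, so that the free path can opt into GFP_NOFS|__GFP_NOFAIL. A stand-alone sketch of that shape, with stand-in types and flag values (not the kernel's) so it compiles outside the kernel:

	typedef unsigned int gfp_t;
	#define GFP_NOFS	0x01u	/* stand-in value, not the kernel's */
	#define __GFP_NOFAIL	0x02u	/* stand-in value, not the kernel's */

	static int load_buddy_gfp(int group, gfp_t gfp)
	{
		(void)group;
		(void)gfp;	/* allocations would use the caller's mask */
		return 0;
	}

	/* Old entry point survives as a wrapper with the old default. */
	static int load_buddy(int group)
	{
		return load_buddy_gfp(group, GFP_NOFS);
	}

	/* Paths that must not fail (e.g. ext4_free_blocks) opt in explicitly. */
	static int free_path(int group)
	{
		return load_buddy_gfp(group, GFP_NOFS | __GFP_NOFAIL);
	}
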
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d634e183b4d4..3ef1df6ae9ec 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,18 +23,6 @@
23#include "ext4.h" 23#include "ext4.h"
24 24
25/* 25/*
26 * with AGGRESSIVE_CHECK allocator runs consistency checks over
27 * structures. these checks slow things down a lot
28 */
29#define AGGRESSIVE_CHECK__
30
31/*
32 * with DOUBLE_CHECK defined mballoc creates persistent in-core
33 * bitmaps, maintains and uses them to check for double allocations
34 */
35#define DOUBLE_CHECK__
36
37/*
38 */ 26 */
39#ifdef CONFIG_EXT4_DEBUG 27#ifdef CONFIG_EXT4_DEBUG
40extern ushort ext4_mballoc_debug; 28extern ushort ext4_mballoc_debug;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a4651894cc33..364ea4d4a943 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
361 * blocks. 361 * blocks.
362 * 362 *
363 * While converting to extents we need not 363 * While converting to extents we need not
364 * update the orignal inode i_blocks for extent blocks 364 * update the original inode i_blocks for extent blocks
365 * via quota APIs. The quota update happened via tmp_inode already. 365 * via quota APIs. The quota update happened via tmp_inode already.
366 */ 366 */
367 spin_lock(&inode->i_lock); 367 spin_lock(&inode->i_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0a512aa81bf7..24445275d330 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
91 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); 91 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
92 wait_on_buffer(*bh); 92 wait_on_buffer(*bh);
93 if (!buffer_uptodate(*bh)) { 93 if (!buffer_uptodate(*bh)) {
94 brelse(*bh);
95 *bh = NULL;
96 ret = -EIO; 94 ret = -EIO;
97 goto warn_exit; 95 goto warn_exit;
98 } 96 }
99
100 mmp = (struct mmp_struct *)((*bh)->b_data); 97 mmp = (struct mmp_struct *)((*bh)->b_data);
101 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) 98 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
102 ret = -EFSCORRUPTED; 99 ret = -EFSCORRUPTED;
103 else if (!ext4_mmp_csum_verify(sb, mmp)) 100 goto warn_exit;
101 }
102 if (!ext4_mmp_csum_verify(sb, mmp)) {
104 ret = -EFSBADCRC; 103 ret = -EFSBADCRC;
105 else 104 goto warn_exit;
106 return 0; 105 }
107 106 return 0;
108warn_exit: 107warn_exit:
108 brelse(*bh);
109 *bh = NULL;
109 ext4_warning(sb, "Error %d while reading MMP block %llu", 110 ext4_warning(sb, "Error %d while reading MMP block %llu",
110 ret, mmp_block); 111 ret, mmp_block);
111 return ret; 112 return ret;
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
181 EXT4_FEATURE_INCOMPAT_MMP)) { 182 EXT4_FEATURE_INCOMPAT_MMP)) {
182 ext4_warning(sb, "kmmpd being stopped since MMP feature" 183 ext4_warning(sb, "kmmpd being stopped since MMP feature"
183 " has been disabled."); 184 " has been disabled.");
184 EXT4_SB(sb)->s_mmp_tsk = NULL; 185 goto exit_thread;
185 goto failed;
186 } 186 }
187 187
188 if (sb->s_flags & MS_RDONLY) { 188 if (sb->s_flags & MS_RDONLY) {
189 ext4_warning(sb, "kmmpd being stopped since filesystem " 189 ext4_warning(sb, "kmmpd being stopped since filesystem "
190 "has been remounted as readonly."); 190 "has been remounted as readonly.");
191 EXT4_SB(sb)->s_mmp_tsk = NULL; 191 goto exit_thread;
192 goto failed;
193 } 192 }
194 193
195 diff = jiffies - last_update_time; 194 diff = jiffies - last_update_time;
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
211 if (retval) { 210 if (retval) {
212 ext4_error(sb, "error reading MMP data: %d", 211 ext4_error(sb, "error reading MMP data: %d",
213 retval); 212 retval);
214 213 goto exit_thread;
215 EXT4_SB(sb)->s_mmp_tsk = NULL;
216 goto failed;
217 } 214 }
218 215
219 mmp_check = (struct mmp_struct *)(bh_check->b_data); 216 mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
225 "The filesystem seems to have been" 222 "The filesystem seems to have been"
226 " multiply mounted."); 223 " multiply mounted.");
227 ext4_error(sb, "abort"); 224 ext4_error(sb, "abort");
228 goto failed; 225 put_bh(bh_check);
226 retval = -EBUSY;
227 goto exit_thread;
229 } 228 }
230 put_bh(bh_check); 229 put_bh(bh_check);
231 } 230 }
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
248 247
249 retval = write_mmp_block(sb, bh); 248 retval = write_mmp_block(sb, bh);
250 249
251failed: 250exit_thread:
251 EXT4_SB(sb)->s_mmp_tsk = NULL;
252 kfree(data); 252 kfree(data);
253 brelse(bh); 253 brelse(bh);
254 return retval; 254 return retval;
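
The kmmpd() changes above are a single-exit cleanup: every bail-out used to clear EXT4_SB(sb)->s_mmp_tsk by hand, and the multiply-mounted path both forgot to clear it and returned no error, so the patch funnels all exits through one exit_thread label. The shape, as a minimal sketch with stub helpers standing in for the real MMP checks:

	static int kmmpd_shape(void *data)
	{
		int retval = 0;

		for (;;) {
			if (feature_disabled() || remounted_ro())
				goto exit_thread;
			if (multiply_mounted()) {
				retval = -EBUSY;
				goto exit_thread;
			}
			/* ...sleep, rewrite the MMP block, loop... */
		}
	exit_thread:
		clear_mmp_task();	/* done once, on every exit path */
		kfree(data);
		return retval;
	}
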
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 090b3498638e..d77d15f4b674 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
128 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 128 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
129 WARN_ON(io_end->handle); 129 WARN_ON(io_end->handle);
130 130
131 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
132 wake_up_all(ext4_ioend_wq(io_end->inode));
133
134 for (bio = io_end->bio; bio; bio = next_bio) { 131 for (bio = io_end->bio; bio; bio = next_bio) {
135 next_bio = bio->bi_private; 132 next_bio = bio->bi_private;
136 ext4_finish_bio(bio); 133 ext4_finish_bio(bio);
@@ -139,16 +136,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
139 kmem_cache_free(io_end_cachep, io_end); 136 kmem_cache_free(io_end_cachep, io_end);
140} 137}
141 138
142static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
143{
144 struct inode *inode = io_end->inode;
145
146 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
147 /* Wake up anyone waiting on unwritten extent conversion */
148 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
149 wake_up_all(ext4_ioend_wq(inode));
150}
151
152/* 139/*
153 * Check a range of space and convert unwritten extents to written. Note that 140 * Check a range of space and convert unwritten extents to written. Note that
154 * we are protected from truncate touching same part of extent tree by the 141 * we are protected from truncate touching same part of extent tree by the
@@ -265,7 +252,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
265{ 252{
266 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 253 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
267 if (io) { 254 if (io) {
268 atomic_inc(&EXT4_I(inode)->i_ioend_count);
269 io->inode = inode; 255 io->inode = inode;
270 INIT_LIST_HEAD(&io->list); 256 INIT_LIST_HEAD(&io->list);
271 atomic_set(&io->count, 1); 257 atomic_set(&io->count, 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ed01ec011d7..539297515896 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
55 55
56static struct ext4_lazy_init *ext4_li_info; 56static struct ext4_lazy_init *ext4_li_info;
57static struct mutex ext4_li_mtx; 57static struct mutex ext4_li_mtx;
58static int ext4_mballoc_ready;
59static struct ratelimit_state ext4_mount_msg_ratelimit; 58static struct ratelimit_state ext4_mount_msg_ratelimit;
60 59
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 60static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
844 ext4_release_system_zone(sb); 843 ext4_release_system_zone(sb);
845 ext4_mb_release(sb); 844 ext4_mb_release(sb);
846 ext4_ext_release(sb); 845 ext4_ext_release(sb);
847 ext4_xattr_put_super(sb);
848 846
849 if (!(sb->s_flags & MS_RDONLY)) { 847 if (!(sb->s_flags & MS_RDONLY)) {
850 ext4_clear_feature_journal_needs_recovery(sb); 848 ext4_clear_feature_journal_needs_recovery(sb);
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
944 spin_lock_init(&ei->i_completed_io_lock); 942 spin_lock_init(&ei->i_completed_io_lock);
945 ei->i_sync_tid = 0; 943 ei->i_sync_tid = 0;
946 ei->i_datasync_tid = 0; 944 ei->i_datasync_tid = 0;
947 atomic_set(&ei->i_ioend_count, 0);
948 atomic_set(&ei->i_unwritten, 0); 945 atomic_set(&ei->i_unwritten, 0);
949 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 946 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
950#ifdef CONFIG_EXT4_FS_ENCRYPTION 947#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -1132,6 +1129,7 @@ static const struct dquot_operations ext4_quota_operations = {
1132 .alloc_dquot = dquot_alloc, 1129 .alloc_dquot = dquot_alloc,
1133 .destroy_dquot = dquot_destroy, 1130 .destroy_dquot = dquot_destroy,
1134 .get_projid = ext4_get_projid, 1131 .get_projid = ext4_get_projid,
1132 .get_next_id = dquot_get_next_id,
1135}; 1133};
1136 1134
1137static const struct quotactl_ops ext4_qctl_operations = { 1135static const struct quotactl_ops ext4_qctl_operations = {
@@ -1141,7 +1139,8 @@ static const struct quotactl_ops ext4_qctl_operations = {
1141 .get_state = dquot_get_state, 1139 .get_state = dquot_get_state,
1142 .set_info = dquot_set_dqinfo, 1140 .set_info = dquot_set_dqinfo,
1143 .get_dqblk = dquot_get_dqblk, 1141 .get_dqblk = dquot_get_dqblk,
1144 .set_dqblk = dquot_set_dqblk 1142 .set_dqblk = dquot_set_dqblk,
1143 .get_nextdqblk = dquot_get_next_dqblk,
1145}; 1144};
1146#endif 1145#endif
1147 1146
@@ -1425,9 +1424,9 @@ static const struct mount_opts {
1425 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, 1424 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1426 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, 1425 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1427 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, 1426 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1428 MOPT_NO_EXT2 | MOPT_SET}, 1427 MOPT_NO_EXT2},
1429 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, 1428 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1430 MOPT_NO_EXT2 | MOPT_CLEAR}, 1429 MOPT_NO_EXT2},
1431 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, 1430 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1432 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, 1431 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1433 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, 1432 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1705,6 +1704,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1705 ext4_msg(sb, KERN_INFO, "dax option not supported"); 1704 ext4_msg(sb, KERN_INFO, "dax option not supported");
1706 return -1; 1705 return -1;
1707#endif 1706#endif
1707 } else if (token == Opt_data_err_abort) {
1708 sbi->s_mount_opt |= m->mount_opt;
1709 } else if (token == Opt_data_err_ignore) {
1710 sbi->s_mount_opt &= ~m->mount_opt;
1708 } else { 1711 } else {
1709 if (!args->from) 1712 if (!args->from)
1710 arg = 1; 1713 arg = 1;
@@ -1914,6 +1917,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1914 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 1917 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1915 if (nodefs || sbi->s_max_dir_size_kb) 1918 if (nodefs || sbi->s_max_dir_size_kb)
1916 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); 1919 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
1920 if (test_opt(sb, DATA_ERR_ABORT))
1921 SEQ_OPTS_PUTS("data_err=abort");
1917 1922
1918 ext4_show_quota_options(seq, sb); 1923 ext4_show_quota_options(seq, sb);
1919 return 0; 1924 return 0;
@@ -3796,12 +3801,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3796 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; 3801 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3797 3802
3798no_journal: 3803no_journal:
3799 if (ext4_mballoc_ready) { 3804 sbi->s_mb_cache = ext4_xattr_create_cache();
3800 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); 3805 if (!sbi->s_mb_cache) {
3801 if (!sbi->s_mb_cache) { 3806 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
3802 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); 3807 goto failed_mount_wq;
3803 goto failed_mount_wq;
3804 }
3805 } 3808 }
3806 3809
3807 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && 3810 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4027,6 +4030,10 @@ failed_mount4:
4027 if (EXT4_SB(sb)->rsv_conversion_wq) 4030 if (EXT4_SB(sb)->rsv_conversion_wq)
4028 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4031 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4029failed_mount_wq: 4032failed_mount_wq:
4033 if (sbi->s_mb_cache) {
4034 ext4_xattr_destroy_cache(sbi->s_mb_cache);
4035 sbi->s_mb_cache = NULL;
4036 }
4030 if (sbi->s_journal) { 4037 if (sbi->s_journal) {
4031 jbd2_journal_destroy(sbi->s_journal); 4038 jbd2_journal_destroy(sbi->s_journal);
4032 sbi->s_journal = NULL; 4039 sbi->s_journal = NULL;
@@ -5321,7 +5328,6 @@ MODULE_ALIAS_FS("ext4");
5321 5328
5322/* Shared across all ext4 file systems */ 5329/* Shared across all ext4 file systems */
5323wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 5330wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
5324struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
5325 5331
5326static int __init ext4_init_fs(void) 5332static int __init ext4_init_fs(void)
5327{ 5333{
@@ -5334,10 +5340,8 @@ static int __init ext4_init_fs(void)
5334 /* Build-time check for flags consistency */ 5340 /* Build-time check for flags consistency */
5335 ext4_check_flag_values(); 5341 ext4_check_flag_values();
5336 5342
5337 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 5343 for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
5338 mutex_init(&ext4__aio_mutex[i]);
5339 init_waitqueue_head(&ext4__ioend_wq[i]); 5344 init_waitqueue_head(&ext4__ioend_wq[i]);
5340 }
5341 5345
5342 err = ext4_init_es(); 5346 err = ext4_init_es();
5343 if (err) 5347 if (err)
@@ -5358,8 +5362,6 @@ static int __init ext4_init_fs(void)
5358 err = ext4_init_mballoc(); 5362 err = ext4_init_mballoc();
5359 if (err) 5363 if (err)
5360 goto out2; 5364 goto out2;
5361 else
5362 ext4_mballoc_ready = 1;
5363 err = init_inodecache(); 5365 err = init_inodecache();
5364 if (err) 5366 if (err)
5365 goto out1; 5367 goto out1;
@@ -5375,7 +5377,6 @@ out:
5375 unregister_as_ext3(); 5377 unregister_as_ext3();
5376 destroy_inodecache(); 5378 destroy_inodecache();
5377out1: 5379out1:
5378 ext4_mballoc_ready = 0;
5379 ext4_exit_mballoc(); 5380 ext4_exit_mballoc();
5380out2: 5381out2:
5381 ext4_exit_sysfs(); 5382 ext4_exit_sysfs();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a95151e875bd..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,30 +545,44 @@ static void
545ext4_xattr_release_block(handle_t *handle, struct inode *inode, 545ext4_xattr_release_block(handle_t *handle, struct inode *inode,
546 struct buffer_head *bh) 546 struct buffer_head *bh)
547{ 547{
548 struct mb_cache_entry *ce = NULL;
549 int error = 0;
550 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 548 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
549 u32 hash, ref;
550 int error = 0;
551 551
552 ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
553 BUFFER_TRACE(bh, "get_write_access"); 552 BUFFER_TRACE(bh, "get_write_access");
554 error = ext4_journal_get_write_access(handle, bh); 553 error = ext4_journal_get_write_access(handle, bh);
555 if (error) 554 if (error)
556 goto out; 555 goto out;
557 556
558 lock_buffer(bh); 557 lock_buffer(bh);
559 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { 558 hash = le32_to_cpu(BHDR(bh)->h_hash);
559 ref = le32_to_cpu(BHDR(bh)->h_refcount);
560 if (ref == 1) {
560 ea_bdebug(bh, "refcount now=0; freeing"); 561 ea_bdebug(bh, "refcount now=0; freeing");
561 if (ce) 562 /*
562 mb_cache_entry_free(ce); 563 * This must happen under buffer lock for
564 * ext4_xattr_block_set() to reliably detect a freed block
565 */
566 mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
563 get_bh(bh); 567 get_bh(bh);
564 unlock_buffer(bh); 568 unlock_buffer(bh);
565 ext4_free_blocks(handle, inode, bh, 0, 1, 569 ext4_free_blocks(handle, inode, bh, 0, 1,
566 EXT4_FREE_BLOCKS_METADATA | 570 EXT4_FREE_BLOCKS_METADATA |
567 EXT4_FREE_BLOCKS_FORGET); 571 EXT4_FREE_BLOCKS_FORGET);
568 } else { 572 } else {
569 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 573 ref--;
570 if (ce) 574 BHDR(bh)->h_refcount = cpu_to_le32(ref);
571 mb_cache_entry_release(ce); 575 if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
576 struct mb_cache_entry *ce;
577
578 ce = mb_cache_entry_get(ext4_mb_cache, hash,
579 bh->b_blocknr);
580 if (ce) {
581 ce->e_reusable = 1;
582 mb_cache_entry_put(ext4_mb_cache, ce);
583 }
584 }
585
572 /* 586 /*
573 * Beware of this ugliness: Releasing of xattr block references 587 * Beware of this ugliness: Releasing of xattr block references
574 * from different inodes can race and so we have to protect 588 * from different inodes can race and so we have to protect
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
790 if (i->value && i->value_len > sb->s_blocksize) 804 if (i->value && i->value_len > sb->s_blocksize)
791 return -ENOSPC; 805 return -ENOSPC;
792 if (s->base) { 806 if (s->base) {
793 ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
794 bs->bh->b_blocknr);
795 BUFFER_TRACE(bs->bh, "get_write_access"); 807 BUFFER_TRACE(bs->bh, "get_write_access");
796 error = ext4_journal_get_write_access(handle, bs->bh); 808 error = ext4_journal_get_write_access(handle, bs->bh);
797 if (error) 809 if (error)
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
799 lock_buffer(bs->bh); 811 lock_buffer(bs->bh);
800 812
801 if (header(s->base)->h_refcount == cpu_to_le32(1)) { 813 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
802 if (ce) { 814 __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
803 mb_cache_entry_free(ce); 815
804 ce = NULL; 816 /*
805 } 817 * This must happen under buffer lock for
818 * ext4_xattr_block_set() to reliably detect a
819 * modified block
820 */
821 mb_cache_entry_delete_block(ext4_mb_cache, hash,
822 bs->bh->b_blocknr);
806 ea_bdebug(bs->bh, "modifying in-place"); 823 ea_bdebug(bs->bh, "modifying in-place");
807 error = ext4_xattr_set_entry(i, s); 824 error = ext4_xattr_set_entry(i, s);
808 if (!error) { 825 if (!error) {
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
826 int offset = (char *)s->here - bs->bh->b_data; 843 int offset = (char *)s->here - bs->bh->b_data;
827 844
828 unlock_buffer(bs->bh); 845 unlock_buffer(bs->bh);
829 if (ce) {
830 mb_cache_entry_release(ce);
831 ce = NULL;
832 }
833 ea_bdebug(bs->bh, "cloning"); 846 ea_bdebug(bs->bh, "cloning");
834 s->base = kmalloc(bs->bh->b_size, GFP_NOFS); 847 s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
835 error = -ENOMEM; 848 error = -ENOMEM;
@@ -872,6 +885,8 @@ inserted:
872 if (new_bh == bs->bh) 885 if (new_bh == bs->bh)
873 ea_bdebug(new_bh, "keeping"); 886 ea_bdebug(new_bh, "keeping");
874 else { 887 else {
888 u32 ref;
889
875 /* The old block is released after updating 890 /* The old block is released after updating
876 the inode. */ 891 the inode. */
877 error = dquot_alloc_block(inode, 892 error = dquot_alloc_block(inode,
@@ -884,9 +899,40 @@ inserted:
884 if (error) 899 if (error)
885 goto cleanup_dquot; 900 goto cleanup_dquot;
886 lock_buffer(new_bh); 901 lock_buffer(new_bh);
887 le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); 902 /*
903 * We have to be careful about races with
904 * freeing, rehashing or adding references to
905 * the xattr block. Once we hold the buffer
906 * lock, the xattr block's state is stable, so
907 * we can check whether the block got freed or
908 * rehashed. Since we unhash the mbcache entry
909 * under the buffer lock when freeing or
910 * rehashing an xattr block, checking whether
911 * the entry is still hashed is reliable. The
912 * same rules hold for e_reusable handling.
913 */
914 if (hlist_bl_unhashed(&ce->e_hash_list) ||
915 !ce->e_reusable) {
916 /*
917 * Undo everything and check mbcache
918 * again.
919 */
920 unlock_buffer(new_bh);
921 dquot_free_block(inode,
922 EXT4_C2B(EXT4_SB(sb),
923 1));
924 brelse(new_bh);
925 mb_cache_entry_put(ext4_mb_cache, ce);
926 ce = NULL;
927 new_bh = NULL;
928 goto inserted;
929 }
930 ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
931 BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
932 if (ref >= EXT4_XATTR_REFCOUNT_MAX)
933 ce->e_reusable = 0;
888 ea_bdebug(new_bh, "reusing; refcount now=%d", 934 ea_bdebug(new_bh, "reusing; refcount now=%d",
889 le32_to_cpu(BHDR(new_bh)->h_refcount)); 935 ref);
890 unlock_buffer(new_bh); 936 unlock_buffer(new_bh);
891 error = ext4_handle_dirty_xattr_block(handle, 937 error = ext4_handle_dirty_xattr_block(handle,
892 inode, 938 inode,
@@ -894,7 +940,8 @@ inserted:
894 if (error) 940 if (error)
895 goto cleanup_dquot; 941 goto cleanup_dquot;
896 } 942 }
897 mb_cache_entry_release(ce); 943 mb_cache_entry_touch(ext4_mb_cache, ce);
944 mb_cache_entry_put(ext4_mb_cache, ce);
898 ce = NULL; 945 ce = NULL;
899 } else if (bs->bh && s->base == bs->bh->b_data) { 946 } else if (bs->bh && s->base == bs->bh->b_data) {
900 /* We were modifying this block in-place. */ 947 /* We were modifying this block in-place. */
@@ -959,7 +1006,7 @@ getblk_failed:
959 1006
960cleanup: 1007cleanup:
961 if (ce) 1008 if (ce)
962 mb_cache_entry_release(ce); 1009 mb_cache_entry_put(ext4_mb_cache, ce);
963 brelse(new_bh); 1010 brelse(new_bh);
964 if (!(bs->bh && s->base == bs->bh->b_data)) 1011 if (!(bs->bh && s->base == bs->bh->b_data))
965 kfree(s->base); 1012 kfree(s->base);
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
1070 return 0; 1117 return 0;
1071} 1118}
1072 1119
1120static int ext4_xattr_value_same(struct ext4_xattr_search *s,
1121 struct ext4_xattr_info *i)
1122{
1123 void *value;
1124
1125 if (le32_to_cpu(s->here->e_value_size) != i->value_len)
1126 return 0;
1127 value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
1128 return !memcmp(value, i->value, i->value_len);
1129}
1130
1073/* 1131/*
1074 * ext4_xattr_set_handle() 1132 * ext4_xattr_set_handle()
1075 * 1133 *
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1146 else if (!bs.s.not_found) 1204 else if (!bs.s.not_found)
1147 error = ext4_xattr_block_set(handle, inode, &i, &bs); 1205 error = ext4_xattr_block_set(handle, inode, &i, &bs);
1148 } else { 1206 } else {
1207 error = 0;
1208 /* Xattr value did not change? Save ourselves some work and bail out */
1209 if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
1210 goto cleanup;
1211 if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
1212 goto cleanup;
1213
1149 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 1214 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
1150 if (!error && !bs.s.not_found) { 1215 if (!error && !bs.s.not_found) {
1151 i.value = NULL; 1216 i.value = NULL;
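
The bail-out added above is a cheap equality test against the stored value before any journaling work begins. The test itself, restated outside the ext4 structs (a hedged sketch; the real check is ext4_xattr_value_same() earlier in this patch):

	#include <stdint.h>
	#include <string.h>

	/* Same length and same bytes: the set operation can return early
	 * without dirtying the inode or the xattr block. */
	static int value_same(const void *stored, uint32_t stored_len,
			      const void *new_val, uint32_t new_len)
	{
		return stored_len == new_len &&
		       !memcmp(stored, new_val, new_len);
	}
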
@@ -1512,17 +1577,6 @@ cleanup:
1512} 1577}
1513 1578
1514/* 1579/*
1515 * ext4_xattr_put_super()
1516 *
1517 * This is called when a file system is unmounted.
1518 */
1519void
1520ext4_xattr_put_super(struct super_block *sb)
1521{
1522 mb_cache_shrink(sb->s_bdev);
1523}
1524
1525/*
1526 * ext4_xattr_cache_insert() 1580 * ext4_xattr_cache_insert()
1527 * 1581 *
1528 * Create a new entry in the extended attribute cache, and insert 1582 * Create a new entry in the extended attribute cache, and insert
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb)
1533static void 1587static void
1534ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) 1588ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
1535{ 1589{
1536 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1590 struct ext4_xattr_header *header = BHDR(bh);
1537 struct mb_cache_entry *ce; 1591 __u32 hash = le32_to_cpu(header->h_hash);
1592 int reusable = le32_to_cpu(header->h_refcount) <
1593 EXT4_XATTR_REFCOUNT_MAX;
1538 int error; 1594 int error;
1539 1595
1540 ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); 1596 error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
1541 if (!ce) { 1597 bh->b_blocknr, reusable);
1542 ea_bdebug(bh, "out of memory");
1543 return;
1544 }
1545 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1546 if (error) { 1598 if (error) {
1547 mb_cache_entry_free(ce); 1599 if (error == -EBUSY)
1548 if (error == -EBUSY) {
1549 ea_bdebug(bh, "already in cache"); 1600 ea_bdebug(bh, "already in cache");
1550 error = 0; 1601 } else
1551 }
1552 } else {
1553 ea_bdebug(bh, "inserting [%x]", (int)hash); 1602 ea_bdebug(bh, "inserting [%x]", (int)hash);
1554 mb_cache_entry_release(ce);
1555 }
1556} 1603}
1557 1604
1558/* 1605/*
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1614 if (!header->h_hash) 1661 if (!header->h_hash)
1615 return NULL; /* never share */ 1662 return NULL; /* never share */
1616 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1663 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1617again: 1664 ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
1618 ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
1619 hash);
1620 while (ce) { 1665 while (ce) {
1621 struct buffer_head *bh; 1666 struct buffer_head *bh;
1622 1667
1623 if (IS_ERR(ce)) {
1624 if (PTR_ERR(ce) == -EAGAIN)
1625 goto again;
1626 break;
1627 }
1628 bh = sb_bread(inode->i_sb, ce->e_block); 1668 bh = sb_bread(inode->i_sb, ce->e_block);
1629 if (!bh) { 1669 if (!bh) {
1630 EXT4_ERROR_INODE(inode, "block %lu read error", 1670 EXT4_ERROR_INODE(inode, "block %lu read error",
1631 (unsigned long) ce->e_block); 1671 (unsigned long) ce->e_block);
1632 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1633 EXT4_XATTR_REFCOUNT_MAX) {
1634 ea_idebug(inode, "block %lu refcount %d>=%d",
1635 (unsigned long) ce->e_block,
1636 le32_to_cpu(BHDR(bh)->h_refcount),
1637 EXT4_XATTR_REFCOUNT_MAX);
1638 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { 1672 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1639 *pce = ce; 1673 *pce = ce;
1640 return bh; 1674 return bh;
1641 } 1675 }
1642 brelse(bh); 1676 brelse(bh);
1643 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); 1677 ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
1644 } 1678 }
1645 return NULL; 1679 return NULL;
1646} 1680}
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1716#define HASH_BUCKET_BITS 10 1750#define HASH_BUCKET_BITS 10
1717 1751
1718struct mb_cache * 1752struct mb_cache *
1719ext4_xattr_create_cache(char *name) 1753ext4_xattr_create_cache(void)
1720{ 1754{
1721 return mb_cache_create(name, HASH_BUCKET_BITS); 1755 return mb_cache_create(HASH_BUCKET_BITS);
1722} 1756}
1723 1757
1724void ext4_xattr_destroy_cache(struct mb_cache *cache) 1758void ext4_xattr_destroy_cache(struct mb_cache *cache)
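
Taken together, the xattr.c hunks move ext4 to the reworked mbcache API: entries are keyed by (hash, block) via mb_cache_entry_create() with a reusable flag, found with mb_cache_entry_find_first()/mb_cache_entry_find_next(), pinned and released with mb_cache_entry_get()/mb_cache_entry_put(), and unhashed under the buffer lock with mb_cache_entry_delete_block(). Condensed from ext4_xattr_cache_find() above (error reporting elided), the lookup loop now reads:

	ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
	while (ce) {
		bh = sb_bread(inode->i_sb, ce->e_block);
		if (bh && ext4_xattr_cmp(header, BHDR(bh)) == 0)
			return bh;	/* caller inherits the ce reference */
		brelse(bh);
		ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
	}
	return NULL;
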
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ddc0957760ba..69dd3e6566e0 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
108extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); 108extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
109 109
110extern void ext4_xattr_delete_inode(handle_t *, struct inode *); 110extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
111extern void ext4_xattr_put_super(struct super_block *);
112 111
113extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 112extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
114 struct ext4_inode *raw_inode, handle_t *handle); 113 struct ext4_inode *raw_inode, handle_t *handle);
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
124 struct ext4_xattr_info *i, 123 struct ext4_xattr_info *i,
125 struct ext4_xattr_ibody_find *is); 124 struct ext4_xattr_ibody_find *is);
126 125
127extern struct mb_cache *ext4_xattr_create_cache(char *name); 126extern struct mb_cache *ext4_xattr_create_cache(void);
128extern void ext4_xattr_destroy_cache(struct mb_cache *); 127extern void ext4_xattr_destroy_cache(struct mb_cache *);
129 128
130#ifdef CONFIG_EXT4_FS_SECURITY 129#ifdef CONFIG_EXT4_FS_SECURITY
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index b0a9dc929f88..1f8982a957f1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,6 +1,8 @@
1config F2FS_FS 1config F2FS_FS
2 tristate "F2FS filesystem support" 2 tristate "F2FS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select CRYPTO
5 select CRYPTO_CRC32
4 help 6 help
5 F2FS is based on Log-structured File System (LFS), which supports 7 F2FS is based on Log-structured File System (LFS), which supports
6 versatile "flash-friendly" features. The design has been focused on 8 versatile "flash-friendly" features. The design has been focused on
@@ -76,15 +78,7 @@ config F2FS_FS_ENCRYPTION
76 bool "F2FS Encryption" 78 bool "F2FS Encryption"
77 depends on F2FS_FS 79 depends on F2FS_FS
78 depends on F2FS_FS_XATTR 80 depends on F2FS_FS_XATTR
79 select CRYPTO_AES 81 select FS_ENCRYPTION
80 select CRYPTO_CBC
81 select CRYPTO_ECB
82 select CRYPTO_XTS
83 select CRYPTO_CTS
84 select CRYPTO_CTR
85 select CRYPTO_SHA256
86 select KEYS
87 select ENCRYPTED_KEYS
88 help 82 help
89 Enable encryption of f2fs files and directories. This 83 Enable encryption of f2fs files and directories. This
90 feature is similar to ecryptfs, but it is more memory 84 feature is similar to ecryptfs, but it is more memory
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 08e101ed914c..ca949ea7c02f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
7f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 7f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
8f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o 8f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
9f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o 9f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
10f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
11 crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 3842af954cd5..0955312e5ca0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -39,7 +39,7 @@ repeat:
39 cond_resched(); 39 cond_resched();
40 goto repeat; 40 goto repeat;
41 } 41 }
42 f2fs_wait_on_page_writeback(page, META); 42 f2fs_wait_on_page_writeback(page, META, true);
43 SetPageUptodate(page); 43 SetPageUptodate(page);
44 return page; 44 return page;
45} 45}
@@ -56,7 +56,8 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
56 .sbi = sbi, 56 .sbi = sbi,
57 .type = META, 57 .type = META,
58 .rw = READ_SYNC | REQ_META | REQ_PRIO, 58 .rw = READ_SYNC | REQ_META | REQ_PRIO,
59 .blk_addr = index, 59 .old_blkaddr = index,
60 .new_blkaddr = index,
60 .encrypted_page = NULL, 61 .encrypted_page = NULL,
61 }; 62 };
62 63
@@ -143,7 +144,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
143int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, 144int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
144 int type, bool sync) 145 int type, bool sync)
145{ 146{
146 block_t prev_blk_addr = 0;
147 struct page *page; 147 struct page *page;
148 block_t blkno = start; 148 block_t blkno = start;
149 struct f2fs_io_info fio = { 149 struct f2fs_io_info fio = {
@@ -152,10 +152,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
152 .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, 152 .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
153 .encrypted_page = NULL, 153 .encrypted_page = NULL,
154 }; 154 };
155 struct blk_plug plug;
155 156
156 if (unlikely(type == META_POR)) 157 if (unlikely(type == META_POR))
157 fio.rw &= ~REQ_META; 158 fio.rw &= ~REQ_META;
158 159
160 blk_start_plug(&plug);
159 for (; nrpages-- > 0; blkno++) { 161 for (; nrpages-- > 0; blkno++) {
160 162
161 if (!is_valid_blkaddr(sbi, blkno, type)) 163 if (!is_valid_blkaddr(sbi, blkno, type))
@@ -167,27 +169,24 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
167 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) 169 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
168 blkno = 0; 170 blkno = 0;
169 /* get nat block addr */ 171 /* get nat block addr */
170 fio.blk_addr = current_nat_addr(sbi, 172 fio.new_blkaddr = current_nat_addr(sbi,
171 blkno * NAT_ENTRY_PER_BLOCK); 173 blkno * NAT_ENTRY_PER_BLOCK);
172 break; 174 break;
173 case META_SIT: 175 case META_SIT:
174 /* get sit block addr */ 176 /* get sit block addr */
175 fio.blk_addr = current_sit_addr(sbi, 177 fio.new_blkaddr = current_sit_addr(sbi,
176 blkno * SIT_ENTRY_PER_BLOCK); 178 blkno * SIT_ENTRY_PER_BLOCK);
177 if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
178 goto out;
179 prev_blk_addr = fio.blk_addr;
180 break; 179 break;
181 case META_SSA: 180 case META_SSA:
182 case META_CP: 181 case META_CP:
183 case META_POR: 182 case META_POR:
184 fio.blk_addr = blkno; 183 fio.new_blkaddr = blkno;
185 break; 184 break;
186 default: 185 default:
187 BUG(); 186 BUG();
188 } 187 }
189 188
190 page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); 189 page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr);
191 if (!page) 190 if (!page)
192 continue; 191 continue;
193 if (PageUptodate(page)) { 192 if (PageUptodate(page)) {
@@ -196,11 +195,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
196 } 195 }
197 196
198 fio.page = page; 197 fio.page = page;
198 fio.old_blkaddr = fio.new_blkaddr;
199 f2fs_submit_page_mbio(&fio); 199 f2fs_submit_page_mbio(&fio);
200 f2fs_put_page(page, 0); 200 f2fs_put_page(page, 0);
201 } 201 }
202out: 202out:
203 f2fs_submit_merged_bio(sbi, META, READ); 203 f2fs_submit_merged_bio(sbi, META, READ);
204 blk_finish_plug(&plug);
204 return blkno - start; 205 return blkno - start;
205} 206}
206 207
@@ -232,13 +233,17 @@ static int f2fs_write_meta_page(struct page *page,
232 if (unlikely(f2fs_cp_error(sbi))) 233 if (unlikely(f2fs_cp_error(sbi)))
233 goto redirty_out; 234 goto redirty_out;
234 235
235 f2fs_wait_on_page_writeback(page, META);
236 write_meta_page(sbi, page); 236 write_meta_page(sbi, page);
237 dec_page_count(sbi, F2FS_DIRTY_META); 237 dec_page_count(sbi, F2FS_DIRTY_META);
238
239 if (wbc->for_reclaim)
240 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE);
241
238 unlock_page(page); 242 unlock_page(page);
239 243
240 if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) 244 if (unlikely(f2fs_cp_error(sbi)))
241 f2fs_submit_merged_bio(sbi, META, WRITE); 245 f2fs_submit_merged_bio(sbi, META, WRITE);
246
242 return 0; 247 return 0;
243 248
244redirty_out: 249redirty_out:
@@ -252,13 +257,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
252 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); 257 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
253 long diff, written; 258 long diff, written;
254 259
255 trace_f2fs_writepages(mapping->host, wbc, META);
256
257 /* collect a number of dirty meta pages and write together */ 260 /* collect a number of dirty meta pages and write together */
258 if (wbc->for_kupdate || 261 if (wbc->for_kupdate ||
259 get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) 262 get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
260 goto skip_write; 263 goto skip_write;
261 264
265 trace_f2fs_writepages(mapping->host, wbc, META);
266
262 /* if mounting is failed, skip writing node pages */ 267 /* if mounting is failed, skip writing node pages */
263 mutex_lock(&sbi->cp_mutex); 268 mutex_lock(&sbi->cp_mutex);
264 diff = nr_pages_to_write(sbi, META, wbc); 269 diff = nr_pages_to_write(sbi, META, wbc);
@@ -269,6 +274,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
269 274
270skip_write: 275skip_write:
271 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); 276 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
277 trace_f2fs_writepages(mapping->host, wbc, META);
272 return 0; 278 return 0;
273} 279}
274 280
@@ -276,15 +282,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
276 long nr_to_write) 282 long nr_to_write)
277{ 283{
278 struct address_space *mapping = META_MAPPING(sbi); 284 struct address_space *mapping = META_MAPPING(sbi);
279 pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; 285 pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
280 struct pagevec pvec; 286 struct pagevec pvec;
281 long nwritten = 0; 287 long nwritten = 0;
282 struct writeback_control wbc = { 288 struct writeback_control wbc = {
283 .for_reclaim = 0, 289 .for_reclaim = 0,
284 }; 290 };
291 struct blk_plug plug;
285 292
286 pagevec_init(&pvec, 0); 293 pagevec_init(&pvec, 0);
287 294
295 blk_start_plug(&plug);
296
288 while (index <= end) { 297 while (index <= end) {
289 int i, nr_pages; 298 int i, nr_pages;
290 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 299 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
@@ -296,7 +305,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
296 for (i = 0; i < nr_pages; i++) { 305 for (i = 0; i < nr_pages; i++) {
297 struct page *page = pvec.pages[i]; 306 struct page *page = pvec.pages[i];
298 307
299 if (prev == LONG_MAX) 308 if (prev == ULONG_MAX)
300 prev = page->index - 1; 309 prev = page->index - 1;
301 if (nr_to_write != LONG_MAX && page->index != prev + 1) { 310 if (nr_to_write != LONG_MAX && page->index != prev + 1) {
302 pagevec_release(&pvec); 311 pagevec_release(&pvec);
@@ -315,6 +324,9 @@ continue_unlock:
315 goto continue_unlock; 324 goto continue_unlock;
316 } 325 }
317 326
327 f2fs_wait_on_page_writeback(page, META, true);
328
329 BUG_ON(PageWriteback(page));
318 if (!clear_page_dirty_for_io(page)) 330 if (!clear_page_dirty_for_io(page))
319 goto continue_unlock; 331 goto continue_unlock;
320 332
@@ -334,6 +346,8 @@ stop:
334 if (nwritten) 346 if (nwritten)
335 f2fs_submit_merged_bio(sbi, type, WRITE); 347 f2fs_submit_merged_bio(sbi, type, WRITE);
336 348
349 blk_finish_plug(&plug);
350
337 return nwritten; 351 return nwritten;
338} 352}
339 353
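
Both ra_meta_pages() and sync_meta_pages() now bracket their submission loops with a block plug, so the many small meta I/Os are queued per-task and handed to the block layer in one batch where they can be merged. The pattern as used above:

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ...loop issuing many small bios for meta pages... */
	blk_finish_plug(&plug);	/* flush the batched bios to the device */
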
@@ -621,7 +635,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
621 goto invalid_cp1; 635 goto invalid_cp1;
622 636
623 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); 637 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
624 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 638 if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
625 goto invalid_cp1; 639 goto invalid_cp1;
626 640
627 pre_version = cur_cp_version(cp_block); 641 pre_version = cur_cp_version(cp_block);
@@ -636,7 +650,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
636 goto invalid_cp2; 650 goto invalid_cp2;
637 651
638 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); 652 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
639 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 653 if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
640 goto invalid_cp2; 654 goto invalid_cp2;
641 655
642 cur_version = cur_cp_version(cp_block); 656 cur_version = cur_cp_version(cp_block);
@@ -696,6 +710,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
696 cp_block = (struct f2fs_checkpoint *)page_address(cur_page); 710 cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
697 memcpy(sbi->ckpt, cp_block, blk_size); 711 memcpy(sbi->ckpt, cp_block, blk_size);
698 712
713 /* Sanity checking of checkpoint */
714 if (sanity_check_ckpt(sbi))
715 goto fail_no_cp;
716
699 if (cp_blks <= 1) 717 if (cp_blks <= 1)
700 goto done; 718 goto done;
701 719
@@ -902,7 +920,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
902 if (!get_pages(sbi, F2FS_WRITEBACK)) 920 if (!get_pages(sbi, F2FS_WRITEBACK))
903 break; 921 break;
904 922
905 io_schedule(); 923 io_schedule_timeout(5*HZ);
906 } 924 }
907 finish_wait(&sbi->cp_wait, &wait); 925 finish_wait(&sbi->cp_wait, &wait);
908} 926}
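Review note: switching io_schedule() to io_schedule_timeout(5*HZ) bounds each sleep, so the checkpoint waiter re-checks the writeback count even if a wakeup is lost. The surrounding loop presumably follows the canonical wait pattern (a sketch, not the full function):

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (!get_pages(sbi, F2FS_WRITEBACK))
			break;
		io_schedule_timeout(5 * HZ);	/* bounded: recheck after 5s at worst */
	}
	finish_wait(&sbi->cp_wait, &wait);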
@@ -921,6 +939,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
921 int cp_payload_blks = __cp_payload(sbi); 939 int cp_payload_blks = __cp_payload(sbi);
922 block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); 940 block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
923 bool invalidate = false; 941 bool invalidate = false;
942 struct super_block *sb = sbi->sb;
943 struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
944 u64 kbytes_written;
924 945
925 /* 946 /*
926 * This avoids conducting wrong roll-forward operations and uses 947 * This avoids conducting wrong roll-forward operations and uses
@@ -1008,7 +1029,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1008 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); 1029 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
1009 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 1030 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
1010 1031
1011 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); 1032 crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset));
1012 *((__le32 *)((unsigned char *)ckpt + 1033 *((__le32 *)((unsigned char *)ckpt +
1013 le32_to_cpu(ckpt->checksum_offset))) 1034 le32_to_cpu(ckpt->checksum_offset)))
1014 = cpu_to_le32(crc32); 1035 = cpu_to_le32(crc32);
@@ -1034,6 +1055,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1034 1055
1035 write_data_summaries(sbi, start_blk); 1056 write_data_summaries(sbi, start_blk);
1036 start_blk += data_sum_blocks; 1057 start_blk += data_sum_blocks;
1058
1059 /* Record write statistics in the hot node summary */
1060 kbytes_written = sbi->kbytes_written;
1061 if (sb->s_bdev->bd_part)
1062 kbytes_written += BD_PART_WRITTEN(sbi);
1063
1064 seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
1065
1037 if (__remain_node_summaries(cpc->reason)) { 1066 if (__remain_node_summaries(cpc->reason)) {
1038 write_node_summaries(sbi, start_blk); 1067 write_node_summaries(sbi, start_blk);
1039 start_blk += NR_CURSEG_NODE_TYPE; 1068 start_blk += NR_CURSEG_NODE_TYPE;
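Review note: the checkpoint now folds the block device's lifetime write count into the hot node summary journal, so cumulative kilobytes written survive remounts. BD_PART_WRITTEN() is presumably built on part_stat_read(); a hedged sketch of the conversion (512-byte sectors to KB):

	/* assumption: BD_PART_WRITTEN() reduces to something like this */
	u64 bd_kbytes_written(struct super_block *sb)
	{
		return (u64)part_stat_read(sb->s_bdev->bd_part,
					   sectors[WRITE]) >> 1;
	}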
@@ -1048,8 +1077,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1048 if (unlikely(f2fs_cp_error(sbi))) 1077 if (unlikely(f2fs_cp_error(sbi)))
1049 return -EIO; 1078 return -EIO;
1050 1079
1051 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); 1080 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
1052 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); 1081 filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
1053 1082
1054 /* update user_block_counts */ 1083 /* update user_block_counts */
1055 sbi->last_valid_block_count = sbi->total_valid_block_count; 1084 sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -1112,9 +1141,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1112 1141
1113 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); 1142 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
1114 1143
1115 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1144 f2fs_flush_merged_bios(sbi);
1116 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1117 f2fs_submit_merged_bio(sbi, META, WRITE);
1118 1145
1119 /* 1146 /*
1120 * update checkpoint pack index 1147 * update checkpoint pack index
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
deleted file mode 100644
index 4a62ef14e932..000000000000
--- a/fs/f2fs/crypto.c
+++ /dev/null
@@ -1,491 +0,0 @@
1/*
2 * linux/fs/f2fs/crypto.c
3 *
4 * Copied from linux/fs/ext4/crypto.c
5 *
6 * Copyright (C) 2015, Google, Inc.
7 * Copyright (C) 2015, Motorola Mobility
8 *
9 * This contains encryption functions for f2fs
10 *
11 * Written by Michael Halcrow, 2014.
12 *
13 * Filename encryption additions
14 * Uday Savagaonkar, 2014
15 * Encryption policy handling additions
16 * Ildar Muslukhov, 2014
17 * Remove ext4_encrypted_zeroout(),
18 * add f2fs_restore_and_release_control_page()
19 * Jaegeuk Kim, 2015.
20 *
21 * This has not yet undergone a rigorous security audit.
22 *
23 * The usage of AES-XTS should conform to recommendations in NIST
24 * Special Publication 800-38E and IEEE P1619/D16.
25 */
26#include <crypto/hash.h>
27#include <crypto/sha.h>
28#include <keys/user-type.h>
29#include <keys/encrypted-type.h>
30#include <linux/crypto.h>
31#include <linux/ecryptfs.h>
32#include <linux/gfp.h>
33#include <linux/kernel.h>
34#include <linux/key.h>
35#include <linux/list.h>
36#include <linux/mempool.h>
37#include <linux/module.h>
38#include <linux/mutex.h>
39#include <linux/random.h>
40#include <linux/scatterlist.h>
41#include <linux/spinlock_types.h>
42#include <linux/f2fs_fs.h>
43#include <linux/ratelimit.h>
44#include <linux/bio.h>
45
46#include "f2fs.h"
47#include "xattr.h"
48
49/* Encryption added and removed here! (L: */
50
51static unsigned int num_prealloc_crypto_pages = 32;
52static unsigned int num_prealloc_crypto_ctxs = 128;
53
54module_param(num_prealloc_crypto_pages, uint, 0444);
55MODULE_PARM_DESC(num_prealloc_crypto_pages,
56 "Number of crypto pages to preallocate");
57module_param(num_prealloc_crypto_ctxs, uint, 0444);
58MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
59 "Number of crypto contexts to preallocate");
60
61static mempool_t *f2fs_bounce_page_pool;
62
63static LIST_HEAD(f2fs_free_crypto_ctxs);
64static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
65
66static struct workqueue_struct *f2fs_read_workqueue;
67static DEFINE_MUTEX(crypto_init);
68
69static struct kmem_cache *f2fs_crypto_ctx_cachep;
70struct kmem_cache *f2fs_crypt_info_cachep;
71
72/**
73 * f2fs_release_crypto_ctx() - Releases an encryption context
74 * @ctx: The encryption context to release.
75 *
76 * If the encryption context was allocated from the pre-allocated pool, returns
77 * it to that pool. Else, frees it.
78 *
79 * If there's a bounce page in the context, this frees that.
80 */
81void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
82{
83 unsigned long flags;
84
85 if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
86 mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
87 ctx->w.bounce_page = NULL;
88 }
89 ctx->w.control_page = NULL;
90 if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
91 kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
92 } else {
93 spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
94 list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
95 spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
96 }
97}
98
99/**
100 * f2fs_get_crypto_ctx() - Gets an encryption context
101 * @inode: The inode for which we are doing the crypto
102 *
103 * Allocates and initializes an encryption context.
104 *
105 * Return: An allocated and initialized encryption context on success; error
106 * value or NULL otherwise.
107 */
108struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
109{
110 struct f2fs_crypto_ctx *ctx = NULL;
111 unsigned long flags;
112 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
113
114 if (ci == NULL)
115 return ERR_PTR(-ENOKEY);
116
117 /*
118 * We first try getting the ctx from a free list because in
119 * the common case the ctx will have an allocated and
120 * initialized crypto tfm, so it's probably a worthwhile
121 * optimization. For the bounce page, we first try getting it
122 * from the kernel allocator because that's just about as fast
123 * as getting it from a list and because a cache of free pages
124 * should generally be a "last resort" option for a filesystem
125 * to be able to do its job.
126 */
127 spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
128 ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
129 struct f2fs_crypto_ctx, free_list);
130 if (ctx)
131 list_del(&ctx->free_list);
132 spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
133 if (!ctx) {
134 ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
135 if (!ctx)
136 return ERR_PTR(-ENOMEM);
137 ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
138 } else {
139 ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
140 }
141 ctx->flags &= ~F2FS_WRITE_PATH_FL;
142 return ctx;
143}
144
145/*
146 * Call f2fs_decrypt on every single page, reusing the encryption
147 * context.
148 */
149static void completion_pages(struct work_struct *work)
150{
151 struct f2fs_crypto_ctx *ctx =
152 container_of(work, struct f2fs_crypto_ctx, r.work);
153 struct bio *bio = ctx->r.bio;
154 struct bio_vec *bv;
155 int i;
156
157 bio_for_each_segment_all(bv, bio, i) {
158 struct page *page = bv->bv_page;
159 int ret = f2fs_decrypt(ctx, page);
160
161 if (ret) {
162 WARN_ON_ONCE(1);
163 SetPageError(page);
164 } else
165 SetPageUptodate(page);
166 unlock_page(page);
167 }
168 f2fs_release_crypto_ctx(ctx);
169 bio_put(bio);
170}
171
172void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
173{
174 INIT_WORK(&ctx->r.work, completion_pages);
175 ctx->r.bio = bio;
176 queue_work(f2fs_read_workqueue, &ctx->r.work);
177}
178
179static void f2fs_crypto_destroy(void)
180{
181 struct f2fs_crypto_ctx *pos, *n;
182
183 list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
184 kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
185 INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
186 if (f2fs_bounce_page_pool)
187 mempool_destroy(f2fs_bounce_page_pool);
188 f2fs_bounce_page_pool = NULL;
189}
190
191/**
192 * f2fs_crypto_initialize() - Set up for f2fs encryption.
193 *
194 * We only call this when we start accessing encrypted files, since it
195 * results in memory getting allocated that wouldn't otherwise be used.
196 *
197 * Return: Zero on success, non-zero otherwise.
198 */
199int f2fs_crypto_initialize(void)
200{
201 int i, res = -ENOMEM;
202
203 if (f2fs_bounce_page_pool)
204 return 0;
205
206 mutex_lock(&crypto_init);
207 if (f2fs_bounce_page_pool)
208 goto already_initialized;
209
210 for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
211 struct f2fs_crypto_ctx *ctx;
212
213 ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
214 if (!ctx)
215 goto fail;
216 list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
217 }
218
219 /* must be allocated at the last step to avoid race condition above */
220 f2fs_bounce_page_pool =
221 mempool_create_page_pool(num_prealloc_crypto_pages, 0);
222 if (!f2fs_bounce_page_pool)
223 goto fail;
224
225already_initialized:
226 mutex_unlock(&crypto_init);
227 return 0;
228fail:
229 f2fs_crypto_destroy();
230 mutex_unlock(&crypto_init);
231 return res;
232}
233
234/**
235 * f2fs_exit_crypto() - Shutdown the f2fs encryption system
236 */
237void f2fs_exit_crypto(void)
238{
239 f2fs_crypto_destroy();
240
241 if (f2fs_read_workqueue)
242 destroy_workqueue(f2fs_read_workqueue);
243 if (f2fs_crypto_ctx_cachep)
244 kmem_cache_destroy(f2fs_crypto_ctx_cachep);
245 if (f2fs_crypt_info_cachep)
246 kmem_cache_destroy(f2fs_crypt_info_cachep);
247}
248
249int __init f2fs_init_crypto(void)
250{
251 int res = -ENOMEM;
252
253 f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
254 if (!f2fs_read_workqueue)
255 goto fail;
256
257 f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
258 SLAB_RECLAIM_ACCOUNT);
259 if (!f2fs_crypto_ctx_cachep)
260 goto fail;
261
262 f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
263 SLAB_RECLAIM_ACCOUNT);
264 if (!f2fs_crypt_info_cachep)
265 goto fail;
266
267 return 0;
268fail:
269 f2fs_exit_crypto();
270 return res;
271}
272
273void f2fs_restore_and_release_control_page(struct page **page)
274{
275 struct f2fs_crypto_ctx *ctx;
276 struct page *bounce_page;
277
278 /* The bounce data pages are unmapped. */
279 if ((*page)->mapping)
280 return;
281
282 /* The bounce data page is unmapped. */
283 bounce_page = *page;
284 ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
285
286 /* restore control page */
287 *page = ctx->w.control_page;
288
289 f2fs_restore_control_page(bounce_page);
290}
291
292void f2fs_restore_control_page(struct page *data_page)
293{
294 struct f2fs_crypto_ctx *ctx =
295 (struct f2fs_crypto_ctx *)page_private(data_page);
296
297 set_page_private(data_page, (unsigned long)NULL);
298 ClearPagePrivate(data_page);
299 unlock_page(data_page);
300 f2fs_release_crypto_ctx(ctx);
301}
302
303/**
304 * f2fs_crypt_complete() - The completion callback for page encryption
305 * @req: The asynchronous encryption request context
306 * @res: The result of the encryption operation
307 */
308static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
309{
310 struct f2fs_completion_result *ecr = req->data;
311
312 if (res == -EINPROGRESS)
313 return;
314 ecr->res = res;
315 complete(&ecr->completion);
316}
317
318typedef enum {
319 F2FS_DECRYPT = 0,
320 F2FS_ENCRYPT,
321} f2fs_direction_t;
322
323static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx,
324 struct inode *inode,
325 f2fs_direction_t rw,
326 pgoff_t index,
327 struct page *src_page,
328 struct page *dest_page)
329{
330 u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
331 struct ablkcipher_request *req = NULL;
332 DECLARE_F2FS_COMPLETION_RESULT(ecr);
333 struct scatterlist dst, src;
334 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
335 struct crypto_ablkcipher *tfm = ci->ci_ctfm;
336 int res = 0;
337
338 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
339 if (!req) {
340 printk_ratelimited(KERN_ERR
341 "%s: crypto_request_alloc() failed\n",
342 __func__);
343 return -ENOMEM;
344 }
345 ablkcipher_request_set_callback(
346 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
347 f2fs_crypt_complete, &ecr);
348
349 BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
350 memcpy(xts_tweak, &index, sizeof(index));
351 memset(&xts_tweak[sizeof(index)], 0,
352 F2FS_XTS_TWEAK_SIZE - sizeof(index));
353
354 sg_init_table(&dst, 1);
355 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
356 sg_init_table(&src, 1);
357 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
358 ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
359 xts_tweak);
360 if (rw == F2FS_DECRYPT)
361 res = crypto_ablkcipher_decrypt(req);
362 else
363 res = crypto_ablkcipher_encrypt(req);
364 if (res == -EINPROGRESS || res == -EBUSY) {
365 BUG_ON(req->base.data != &ecr);
366 wait_for_completion(&ecr.completion);
367 res = ecr.res;
368 }
369 ablkcipher_request_free(req);
370 if (res) {
371 printk_ratelimited(KERN_ERR
372 "%s: crypto_ablkcipher_encrypt() returned %d\n",
373 __func__, res);
374 return res;
375 }
376 return 0;
377}
378
379static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
380{
381 ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
382 if (ctx->w.bounce_page == NULL)
383 return ERR_PTR(-ENOMEM);
384 ctx->flags |= F2FS_WRITE_PATH_FL;
385 return ctx->w.bounce_page;
386}
387
388/**
389 * f2fs_encrypt() - Encrypts a page
390 * @inode: The inode for which the encryption should take place
391 * @plaintext_page: The page to encrypt. Must be locked.
392 *
393 * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
394 * encryption context.
395 *
396 * Called on the page write path. The caller must call
397 * f2fs_restore_control_page() on the returned ciphertext page to
398 * release the bounce buffer and the encryption context.
399 *
400 * Return: An allocated page with the encrypted content on success. Else, an
401 * error value or NULL.
402 */
403struct page *f2fs_encrypt(struct inode *inode,
404 struct page *plaintext_page)
405{
406 struct f2fs_crypto_ctx *ctx;
407 struct page *ciphertext_page = NULL;
408 int err;
409
410 BUG_ON(!PageLocked(plaintext_page));
411
412 ctx = f2fs_get_crypto_ctx(inode);
413 if (IS_ERR(ctx))
414 return (struct page *)ctx;
415
416 /* The encryption operation will require a bounce page. */
417 ciphertext_page = alloc_bounce_page(ctx);
418 if (IS_ERR(ciphertext_page))
419 goto err_out;
420
421 ctx->w.control_page = plaintext_page;
422 err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
423 plaintext_page, ciphertext_page);
424 if (err) {
425 ciphertext_page = ERR_PTR(err);
426 goto err_out;
427 }
428
429 SetPagePrivate(ciphertext_page);
430 set_page_private(ciphertext_page, (unsigned long)ctx);
431 lock_page(ciphertext_page);
432 return ciphertext_page;
433
434err_out:
435 f2fs_release_crypto_ctx(ctx);
436 return ciphertext_page;
437}
438
439/**
440 * f2fs_decrypt() - Decrypts a page in-place
441 * @ctx: The encryption context.
442 * @page: The page to decrypt. Must be locked.
443 *
444 * Decrypts page in-place using the ctx encryption context.
445 *
446 * Called from the read completion callback.
447 *
448 * Return: Zero on success, non-zero otherwise.
449 */
450int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page)
451{
452 BUG_ON(!PageLocked(page));
453
454 return f2fs_page_crypto(ctx, page->mapping->host,
455 F2FS_DECRYPT, page->index, page, page);
456}
457
458/*
459 * Convenience function which takes care of allocating and
460 * deallocating the encryption context
461 */
462int f2fs_decrypt_one(struct inode *inode, struct page *page)
463{
464 struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);
465 int ret;
466
467 if (IS_ERR(ctx))
468 return PTR_ERR(ctx);
469 ret = f2fs_decrypt(ctx, page);
470 f2fs_release_crypto_ctx(ctx);
471 return ret;
472}
473
474bool f2fs_valid_contents_enc_mode(uint32_t mode)
475{
476 return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
477}
478
479/**
480 * f2fs_validate_encryption_key_size() - Validate the encryption key size
481 * @mode: The key mode.
482 * @size: The key size to validate.
483 *
484 * Return: The validated key size for @mode. Zero if invalid.
485 */
486uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
487{
488 if (size == f2fs_encryption_key_size(mode))
489 return size;
490 return 0;
491}
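Review note on the deletion above: f2fs's private crypto layer is replaced by the shared fscrypt code (see the fscrypt_* calls in the fs/f2fs/data.c hunks below). The core of the deleted f2fs_page_crypto() was the per-page XTS tweak; a stand-alone C restatement of that construction (user-space, for illustration only; the tweak size is assumed to be 16 bytes here):

	#include <stdint.h>
	#include <string.h>

	#define XTS_TWEAK_SIZE 16	/* assumed value of F2FS_XTS_TWEAK_SIZE */

	static void make_xts_tweak(uint8_t tweak[XTS_TWEAK_SIZE], uint64_t index)
	{
		memcpy(tweak, &index, sizeof(index));	/* page index in the low bytes */
		memset(tweak + sizeof(index), 0,
		       XTS_TWEAK_SIZE - sizeof(index));	/* zero-pad the rest */
	}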
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 5de2d866a25c..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,254 +0,0 @@
1/*
2 * linux/fs/f2fs/crypto_key.c
3 *
4 * Copied from linux/fs/ext4/crypto_key.c
5 *
6 * Copyright (C) 2015, Google, Inc.
7 *
8 * This contains encryption key functions for f2fs
9 *
10 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
11 */
12#include <keys/encrypted-type.h>
13#include <keys/user-type.h>
14#include <linux/random.h>
15#include <linux/scatterlist.h>
16#include <uapi/linux/keyctl.h>
17#include <crypto/hash.h>
18#include <linux/f2fs_fs.h>
19
20#include "f2fs.h"
21#include "xattr.h"
22
23static void derive_crypt_complete(struct crypto_async_request *req, int rc)
24{
25 struct f2fs_completion_result *ecr = req->data;
26
27 if (rc == -EINPROGRESS)
28 return;
29
30 ecr->res = rc;
31 complete(&ecr->completion);
32}
33
34/**
35 * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
36 * @deriving_key: Encryption key used for derivation.
37 * @source_key: Source key to which to apply derivation.
38 * @derived_key: Derived key.
39 *
40 * Return: Zero on success; non-zero otherwise.
41 */
42static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
43 char source_key[F2FS_AES_256_XTS_KEY_SIZE],
44 char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
45{
46 int res = 0;
47 struct ablkcipher_request *req = NULL;
48 DECLARE_F2FS_COMPLETION_RESULT(ecr);
49 struct scatterlist src_sg, dst_sg;
50 struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
51 0);
52
53 if (IS_ERR(tfm)) {
54 res = PTR_ERR(tfm);
55 tfm = NULL;
56 goto out;
57 }
58 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
59 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
60 if (!req) {
61 res = -ENOMEM;
62 goto out;
63 }
64 ablkcipher_request_set_callback(req,
65 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
66 derive_crypt_complete, &ecr);
67 res = crypto_ablkcipher_setkey(tfm, deriving_key,
68 F2FS_AES_128_ECB_KEY_SIZE);
69 if (res < 0)
70 goto out;
71
72 sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
73 sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
74 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
75 F2FS_AES_256_XTS_KEY_SIZE, NULL);
76 res = crypto_ablkcipher_encrypt(req);
77 if (res == -EINPROGRESS || res == -EBUSY) {
78 BUG_ON(req->base.data != &ecr);
79 wait_for_completion(&ecr.completion);
80 res = ecr.res;
81 }
82out:
83 if (req)
84 ablkcipher_request_free(req);
85 if (tfm)
86 crypto_free_ablkcipher(tfm);
87 return res;
88}
89
90static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
91{
92 if (!ci)
93 return;
94
95 key_put(ci->ci_keyring_key);
96 crypto_free_ablkcipher(ci->ci_ctfm);
97 kmem_cache_free(f2fs_crypt_info_cachep, ci);
98}
99
100void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
101{
102 struct f2fs_inode_info *fi = F2FS_I(inode);
103 struct f2fs_crypt_info *prev;
104
105 if (ci == NULL)
106 ci = ACCESS_ONCE(fi->i_crypt_info);
107 if (ci == NULL)
108 return;
109 prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
110 if (prev != ci)
111 return;
112
113 f2fs_free_crypt_info(ci);
114}
115
116int _f2fs_get_encryption_info(struct inode *inode)
117{
118 struct f2fs_inode_info *fi = F2FS_I(inode);
119 struct f2fs_crypt_info *crypt_info;
120 char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
121 (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
122 struct key *keyring_key = NULL;
123 struct f2fs_encryption_key *master_key;
124 struct f2fs_encryption_context ctx;
125 const struct user_key_payload *ukp;
126 struct crypto_ablkcipher *ctfm;
127 const char *cipher_str;
128 char raw_key[F2FS_MAX_KEY_SIZE];
129 char mode;
130 int res;
131
132 res = f2fs_crypto_initialize();
133 if (res)
134 return res;
135retry:
136 crypt_info = ACCESS_ONCE(fi->i_crypt_info);
137 if (crypt_info) {
138 if (!crypt_info->ci_keyring_key ||
139 key_validate(crypt_info->ci_keyring_key) == 0)
140 return 0;
141 f2fs_free_encryption_info(inode, crypt_info);
142 goto retry;
143 }
144
145 res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
146 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
147 &ctx, sizeof(ctx), NULL);
148 if (res < 0)
149 return res;
150 else if (res != sizeof(ctx))
151 return -EINVAL;
152 res = 0;
153
154 crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
155 if (!crypt_info)
156 return -ENOMEM;
157
158 crypt_info->ci_flags = ctx.flags;
159 crypt_info->ci_data_mode = ctx.contents_encryption_mode;
160 crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
161 crypt_info->ci_ctfm = NULL;
162 crypt_info->ci_keyring_key = NULL;
163 memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
164 sizeof(crypt_info->ci_master_key));
165 if (S_ISREG(inode->i_mode))
166 mode = crypt_info->ci_data_mode;
167 else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
168 mode = crypt_info->ci_filename_mode;
169 else
170 BUG();
171
172 switch (mode) {
173 case F2FS_ENCRYPTION_MODE_AES_256_XTS:
174 cipher_str = "xts(aes)";
175 break;
176 case F2FS_ENCRYPTION_MODE_AES_256_CTS:
177 cipher_str = "cts(cbc(aes))";
178 break;
179 default:
180 printk_once(KERN_WARNING
181 "f2fs: unsupported key mode %d (ino %u)\n",
182 mode, (unsigned) inode->i_ino);
183 res = -ENOKEY;
184 goto out;
185 }
186
187 memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
188 F2FS_KEY_DESC_PREFIX_SIZE);
189 sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
190 "%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
191 ctx.master_key_descriptor);
192 full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
193 (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
194 keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
195 if (IS_ERR(keyring_key)) {
196 res = PTR_ERR(keyring_key);
197 keyring_key = NULL;
198 goto out;
199 }
200 crypt_info->ci_keyring_key = keyring_key;
201 BUG_ON(keyring_key->type != &key_type_logon);
202 ukp = user_key_payload(keyring_key);
203 if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
204 res = -EINVAL;
205 goto out;
206 }
207 master_key = (struct f2fs_encryption_key *)ukp->data;
208 BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
209 F2FS_KEY_DERIVATION_NONCE_SIZE);
210 BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE);
211 res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
212 raw_key);
213 if (res)
214 goto out;
215
216 ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
217 if (!ctfm || IS_ERR(ctfm)) {
218 res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
219 printk(KERN_DEBUG
220 "%s: error %d (inode %u) allocating crypto tfm\n",
221 __func__, res, (unsigned) inode->i_ino);
222 goto out;
223 }
224 crypt_info->ci_ctfm = ctfm;
225 crypto_ablkcipher_clear_flags(ctfm, ~0);
226 crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
227 CRYPTO_TFM_REQ_WEAK_KEY);
228 res = crypto_ablkcipher_setkey(ctfm, raw_key,
229 f2fs_encryption_key_size(mode));
230 if (res)
231 goto out;
232
233 memzero_explicit(raw_key, sizeof(raw_key));
234 if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
235 f2fs_free_crypt_info(crypt_info);
236 goto retry;
237 }
238 return 0;
239
240out:
241 if (res == -ENOKEY && !S_ISREG(inode->i_mode))
242 res = 0;
243
244 f2fs_free_crypt_info(crypt_info);
245 memzero_explicit(raw_key, sizeof(raw_key));
246 return res;
247}
248
249int f2fs_has_encryption_key(struct inode *inode)
250{
251 struct f2fs_inode_info *fi = F2FS_I(inode);
252
253 return (fi->i_crypt_info != NULL);
254}
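Review note: the deleted f2fs_derive_key_aes() derives the per-file key by encrypting the 64-byte master key with AES-128-ECB, keyed by the inode's 16-byte nonce. A hedged user-space restatement using OpenSSL's EVP API (illustration only; the kernel path above uses the in-kernel crypto API directly):

	#include <openssl/evp.h>

	static int derive_key_aes(const unsigned char nonce[16],
				  const unsigned char master[64],
				  unsigned char derived[64])
	{
		EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
		int outl = 0, ok = ctx != NULL;

		/* nonce acts as the AES-128-ECB key, as in the code above */
		ok = ok && EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL);
		ok = ok && EVP_CIPHER_CTX_set_padding(ctx, 0);	/* raw blocks, no padding */
		ok = ok && EVP_EncryptUpdate(ctx, derived, &outl, master, 64);
		EVP_CIPHER_CTX_free(ctx);
		return ok ? 0 : -1;
	}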
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
deleted file mode 100644
index d4a96af513c2..000000000000
--- a/fs/f2fs/crypto_policy.c
+++ /dev/null
@@ -1,209 +0,0 @@
1/*
2 * copied from linux/fs/ext4/crypto_policy.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 * Copyright (C) 2015, Motorola Mobility.
6 *
7 * This contains encryption policy functions for f2fs with some modifications
8 * to support f2fs-specific xattr APIs.
9 *
10 * Written by Michael Halcrow, 2015.
11 * Modified by Jaegeuk Kim, 2015.
12 */
13#include <linux/random.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/f2fs_fs.h>
17
18#include "f2fs.h"
19#include "xattr.h"
20
21static int f2fs_inode_has_encryption_context(struct inode *inode)
22{
23 int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
24 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
25 return (res > 0);
26}
27
28/*
29 * check whether the policy is consistent with the encryption context
30 * for the inode
31 */
32static int f2fs_is_encryption_context_consistent_with_policy(
33 struct inode *inode, const struct f2fs_encryption_policy *policy)
34{
35 struct f2fs_encryption_context ctx;
36 int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
37 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
38 sizeof(ctx), NULL);
39
40 if (res != sizeof(ctx))
41 return 0;
42
43 return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
44 F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
45 (ctx.flags == policy->flags) &&
46 (ctx.contents_encryption_mode ==
47 policy->contents_encryption_mode) &&
48 (ctx.filenames_encryption_mode ==
49 policy->filenames_encryption_mode));
50}
51
52static int f2fs_create_encryption_context_from_policy(
53 struct inode *inode, const struct f2fs_encryption_policy *policy)
54{
55 struct f2fs_encryption_context ctx;
56
57 ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
58 memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
59 F2FS_KEY_DESCRIPTOR_SIZE);
60
61 if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
62 printk(KERN_WARNING
63 "%s: Invalid contents encryption mode %d\n", __func__,
64 policy->contents_encryption_mode);
65 return -EINVAL;
66 }
67
68 if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
69 printk(KERN_WARNING
70 "%s: Invalid filenames encryption mode %d\n", __func__,
71 policy->filenames_encryption_mode);
72 return -EINVAL;
73 }
74
75 if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
76 return -EINVAL;
77
78 ctx.contents_encryption_mode = policy->contents_encryption_mode;
79 ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
80 ctx.flags = policy->flags;
81 BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
82 get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
83
84 return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
85 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
86 sizeof(ctx), NULL, XATTR_CREATE);
87}
88
89int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
90 struct inode *inode)
91{
92 if (policy->version != 0)
93 return -EINVAL;
94
95 if (!S_ISDIR(inode->i_mode))
96 return -EINVAL;
97
98 if (!f2fs_inode_has_encryption_context(inode)) {
99 if (!f2fs_empty_dir(inode))
100 return -ENOTEMPTY;
101 return f2fs_create_encryption_context_from_policy(inode,
102 policy);
103 }
104
105 if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
106 return 0;
107
108 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
109 __func__);
110 return -EINVAL;
111}
112
113int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
114{
115 struct f2fs_encryption_context ctx;
116 int res;
117
118 if (!f2fs_encrypted_inode(inode))
119 return -ENODATA;
120
121 res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
122 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
123 &ctx, sizeof(ctx), NULL);
124 if (res != sizeof(ctx))
125 return -ENODATA;
126 if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
127 return -EINVAL;
128
129 policy->version = 0;
130 policy->contents_encryption_mode = ctx.contents_encryption_mode;
131 policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
132 policy->flags = ctx.flags;
133 memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
134 F2FS_KEY_DESCRIPTOR_SIZE);
135 return 0;
136}
137
138int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
139 struct inode *child)
140{
141 struct f2fs_crypt_info *parent_ci, *child_ci;
142 int res;
143
144 if ((parent == NULL) || (child == NULL)) {
145 pr_err("parent %p child %p\n", parent, child);
146 BUG_ON(1);
147 }
148
149 /* no restrictions if the parent directory is not encrypted */
150 if (!f2fs_encrypted_inode(parent))
151 return 1;
152 /* if the child directory is not encrypted, this is always a problem */
153 if (!f2fs_encrypted_inode(child))
154 return 0;
155 res = f2fs_get_encryption_info(parent);
156 if (res)
157 return 0;
158 res = f2fs_get_encryption_info(child);
159 if (res)
160 return 0;
161 parent_ci = F2FS_I(parent)->i_crypt_info;
162 child_ci = F2FS_I(child)->i_crypt_info;
163 if (!parent_ci && !child_ci)
164 return 1;
165 if (!parent_ci || !child_ci)
166 return 0;
167
168 return (memcmp(parent_ci->ci_master_key,
169 child_ci->ci_master_key,
170 F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
171 (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
172 (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
173 (parent_ci->ci_flags == child_ci->ci_flags));
174}
175
176/**
177 * f2fs_inherit_context() - Sets a child context from its parent
178 * @parent: Parent inode from which the context is inherited.
179 * @child: Child inode that inherits the context from @parent.
180 *
181 * Return: Zero on success, non-zero otherwise
182 */
183int f2fs_inherit_context(struct inode *parent, struct inode *child,
184 struct page *ipage)
185{
186 struct f2fs_encryption_context ctx;
187 struct f2fs_crypt_info *ci;
188 int res;
189
190 res = f2fs_get_encryption_info(parent);
191 if (res < 0)
192 return res;
193
194 ci = F2FS_I(parent)->i_crypt_info;
195 BUG_ON(ci == NULL);
196
197 ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
198
199 ctx.contents_encryption_mode = ci->ci_data_mode;
200 ctx.filenames_encryption_mode = ci->ci_filename_mode;
201 ctx.flags = ci->ci_flags;
202 memcpy(ctx.master_key_descriptor, ci->ci_master_key,
203 F2FS_KEY_DESCRIPTOR_SIZE);
204
205 get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
206 return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
207 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
208 sizeof(ctx), ipage, XATTR_CREATE);
209}
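Review note: all three deleted policy helpers round-trip one xattr payload. Its layout, reconstructed from the field accesses above (the field widths are an assumption for illustration, not copied from f2fs_fs.h):

	struct f2fs_encryption_context {
		uint8_t format;				/* F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 */
		uint8_t contents_encryption_mode;	/* e.g. F2FS_ENCRYPTION_MODE_AES_256_XTS */
		uint8_t filenames_encryption_mode;	/* e.g. F2FS_ENCRYPTION_MODE_AES_256_CTS */
		uint8_t flags;				/* masked by F2FS_POLICY_FLAGS_VALID */
		uint8_t master_key_descriptor[8];	/* F2FS_KEY_DESCRIPTOR_SIZE, assumed 8 */
		uint8_t nonce[16];			/* F2FS_KEY_DERIVATION_NONCE_SIZE, assumed 16 */
	};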
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5c06db17e41f..e5c762b37239 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -34,9 +34,9 @@ static void f2fs_read_end_io(struct bio *bio)
34 34
35 if (f2fs_bio_encrypted(bio)) { 35 if (f2fs_bio_encrypted(bio)) {
36 if (bio->bi_error) { 36 if (bio->bi_error) {
37 f2fs_release_crypto_ctx(bio->bi_private); 37 fscrypt_release_ctx(bio->bi_private);
38 } else { 38 } else {
39 f2fs_end_io_crypto_work(bio->bi_private, bio); 39 fscrypt_decrypt_bio_pages(bio->bi_private, bio);
40 return; 40 return;
41 } 41 }
42 } 42 }
@@ -64,10 +64,9 @@ static void f2fs_write_end_io(struct bio *bio)
64 bio_for_each_segment_all(bvec, bio, i) { 64 bio_for_each_segment_all(bvec, bio, i) {
65 struct page *page = bvec->bv_page; 65 struct page *page = bvec->bv_page;
66 66
67 f2fs_restore_and_release_control_page(&page); 67 fscrypt_pullback_bio_page(&page, true);
68 68
69 if (unlikely(bio->bi_error)) { 69 if (unlikely(bio->bi_error)) {
70 set_page_dirty(page);
71 set_bit(AS_EIO, &page->mapping->flags); 70 set_bit(AS_EIO, &page->mapping->flags);
72 f2fs_stop_checkpoint(sbi); 71 f2fs_stop_checkpoint(sbi);
73 } 72 }
@@ -75,8 +74,7 @@ static void f2fs_write_end_io(struct bio *bio)
75 dec_page_count(sbi, F2FS_WRITEBACK); 74 dec_page_count(sbi, F2FS_WRITEBACK);
76 } 75 }
77 76
78 if (!get_pages(sbi, F2FS_WRITEBACK) && 77 if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait))
79 !list_empty(&sbi->cp_wait.task_list))
80 wake_up(&sbi->cp_wait); 78 wake_up(&sbi->cp_wait);
81 79
82 bio_put(bio); 80 bio_put(bio);
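Review note: replacing the open-coded !list_empty(&sbi->cp_wait.task_list) test with wq_has_sleeper() is not just cosmetic; the helper issues the memory barrier that pairs with the waiter's prepare_to_wait(). Roughly what the helper does (paraphrased from memory of include/linux/wait.h of this era, so treat as a sketch):

	static inline bool wq_has_sleeper(wait_queue_head_t *wq)
	{
		smp_mb();	/* order the writer's state change vs. the waiter's check */
		return waitqueue_active(wq);
	}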
@@ -116,8 +114,54 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
116 io->bio = NULL; 114 io->bio = NULL;
117} 115}
118 116
119void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, 117static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
120 enum page_type type, int rw) 118 struct page *page, nid_t ino)
119{
120 struct bio_vec *bvec;
121 struct page *target;
122 int i;
123
124 if (!io->bio)
125 return false;
126
127 if (!inode && !page && !ino)
128 return true;
129
130 bio_for_each_segment_all(bvec, io->bio, i) {
131
132 if (bvec->bv_page->mapping)
133 target = bvec->bv_page;
134 else
135 target = fscrypt_control_page(bvec->bv_page);
136
137 if (inode && inode == target->mapping->host)
138 return true;
139 if (page && page == target)
140 return true;
141 if (ino && ino == ino_of_node(target))
142 return true;
143 }
144
145 return false;
146}
147
148static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
149 struct page *page, nid_t ino,
150 enum page_type type)
151{
152 enum page_type btype = PAGE_TYPE_OF_BIO(type);
153 struct f2fs_bio_info *io = &sbi->write_io[btype];
154 bool ret;
155
156 down_read(&io->io_rwsem);
157 ret = __has_merged_page(io, inode, page, ino);
158 up_read(&io->io_rwsem);
159 return ret;
160}
161
162static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
163 struct inode *inode, struct page *page,
164 nid_t ino, enum page_type type, int rw)
121{ 165{
122 enum page_type btype = PAGE_TYPE_OF_BIO(type); 166 enum page_type btype = PAGE_TYPE_OF_BIO(type);
123 struct f2fs_bio_info *io; 167 struct f2fs_bio_info *io;
@@ -126,6 +170,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
126 170
127 down_write(&io->io_rwsem); 171 down_write(&io->io_rwsem);
128 172
173 if (!__has_merged_page(io, inode, page, ino))
174 goto out;
175
129 /* change META to META_FLUSH in the checkpoint procedure */ 176 /* change META to META_FLUSH in the checkpoint procedure */
130 if (type >= META_FLUSH) { 177 if (type >= META_FLUSH) {
131 io->fio.type = META_FLUSH; 178 io->fio.type = META_FLUSH;
@@ -135,9 +182,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
135 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 182 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
136 } 183 }
137 __submit_merged_bio(io); 184 __submit_merged_bio(io);
185out:
138 up_write(&io->io_rwsem); 186 up_write(&io->io_rwsem);
139} 187}
140 188
189void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
190 int rw)
191{
192 __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw);
193}
194
195void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
196 struct inode *inode, struct page *page,
197 nid_t ino, enum page_type type, int rw)
198{
199 if (has_merged_page(sbi, inode, page, ino, type))
200 __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw);
201}
202
203void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
204{
205 f2fs_submit_merged_bio(sbi, DATA, WRITE);
206 f2fs_submit_merged_bio(sbi, NODE, WRITE);
207 f2fs_submit_merged_bio(sbi, META, WRITE);
208}
209
141/* 210/*
142 * Fill the locked page with data located in the block address. 211 * Fill the locked page with data located in the block address.
143 * Return unlocked page. 212 * Return unlocked page.
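Review note: f2fs_submit_merged_bio_cond() peeks under the read lock via has_merged_page(), and __f2fs_submit_merged_bio() re-checks under the write lock, since the bio can change between the two acquisitions. The shape of the pattern, condensed from the hunks above:

	down_read(&io->io_rwsem);
	found = __has_merged_page(io, inode, page, ino);
	up_read(&io->io_rwsem);

	if (found) {
		down_write(&io->io_rwsem);
		/* re-check: another writer may have submitted it meanwhile */
		if (__has_merged_page(io, inode, page, ino))
			__submit_merged_bio(io);
		up_write(&io->io_rwsem);
	}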
@@ -145,13 +214,14 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
145int f2fs_submit_page_bio(struct f2fs_io_info *fio) 214int f2fs_submit_page_bio(struct f2fs_io_info *fio)
146{ 215{
147 struct bio *bio; 216 struct bio *bio;
148 struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; 217 struct page *page = fio->encrypted_page ?
218 fio->encrypted_page : fio->page;
149 219
150 trace_f2fs_submit_page_bio(page, fio); 220 trace_f2fs_submit_page_bio(page, fio);
151 f2fs_trace_ios(fio, 0); 221 f2fs_trace_ios(fio, 0);
152 222
153 /* Allocate a new bio */ 223 /* Allocate a new bio */
154 bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); 224 bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw));
155 225
156 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 226 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
157 bio_put(bio); 227 bio_put(bio);
@@ -172,21 +242,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
172 242
173 io = is_read ? &sbi->read_io : &sbi->write_io[btype]; 243 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
174 244
175 verify_block_addr(sbi, fio->blk_addr); 245 if (fio->old_blkaddr != NEW_ADDR)
246 verify_block_addr(sbi, fio->old_blkaddr);
247 verify_block_addr(sbi, fio->new_blkaddr);
176 248
177 down_write(&io->io_rwsem); 249 down_write(&io->io_rwsem);
178 250
179 if (!is_read) 251 if (!is_read)
180 inc_page_count(sbi, F2FS_WRITEBACK); 252 inc_page_count(sbi, F2FS_WRITEBACK);
181 253
182 if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || 254 if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
183 io->fio.rw != fio->rw)) 255 io->fio.rw != fio->rw))
184 __submit_merged_bio(io); 256 __submit_merged_bio(io);
185alloc_new: 257alloc_new:
186 if (io->bio == NULL) { 258 if (io->bio == NULL) {
187 int bio_blocks = MAX_BIO_BLOCKS(sbi); 259 int bio_blocks = MAX_BIO_BLOCKS(sbi);
188 260
189 io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); 261 io->bio = __bio_alloc(sbi, fio->new_blkaddr,
262 bio_blocks, is_read);
190 io->fio = *fio; 263 io->fio = *fio;
191 } 264 }
192 265
@@ -198,7 +271,7 @@ alloc_new:
198 goto alloc_new; 271 goto alloc_new;
199 } 272 }
200 273
201 io->last_block_in_bio = fio->blk_addr; 274 io->last_block_in_bio = fio->new_blkaddr;
202 f2fs_trace_ios(fio, 0); 275 f2fs_trace_ios(fio, 0);
203 276
204 up_write(&io->io_rwsem); 277 up_write(&io->io_rwsem);
@@ -218,7 +291,7 @@ void set_data_blkaddr(struct dnode_of_data *dn)
218 struct page *node_page = dn->node_page; 291 struct page *node_page = dn->node_page;
219 unsigned int ofs_in_node = dn->ofs_in_node; 292 unsigned int ofs_in_node = dn->ofs_in_node;
220 293
221 f2fs_wait_on_page_writeback(node_page, NODE); 294 f2fs_wait_on_page_writeback(node_page, NODE, true);
222 295
223 rn = F2FS_NODE(node_page); 296 rn = F2FS_NODE(node_page);
224 297
@@ -229,6 +302,13 @@ void set_data_blkaddr(struct dnode_of_data *dn)
229 dn->node_changed = true; 302 dn->node_changed = true;
230} 303}
231 304
305void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
306{
307 dn->data_blkaddr = blkaddr;
308 set_data_blkaddr(dn);
309 f2fs_update_extent_cache(dn);
310}
311
232int reserve_new_block(struct dnode_of_data *dn) 312int reserve_new_block(struct dnode_of_data *dn)
233{ 313{
234 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 314 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
@@ -332,7 +412,7 @@ got_it:
332 return page; 412 return page;
333 } 413 }
334 414
335 fio.blk_addr = dn.data_blkaddr; 415 fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
336 fio.page = page; 416 fio.page = page;
337 err = f2fs_submit_page_bio(&fio); 417 err = f2fs_submit_page_bio(&fio);
338 if (err) 418 if (err)
@@ -461,7 +541,6 @@ got_it:
461static int __allocate_data_block(struct dnode_of_data *dn) 541static int __allocate_data_block(struct dnode_of_data *dn)
462{ 542{
463 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 543 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
464 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
465 struct f2fs_summary sum; 544 struct f2fs_summary sum;
466 struct node_info ni; 545 struct node_info ni;
467 int seg = CURSEG_WARM_DATA; 546 int seg = CURSEG_WARM_DATA;
@@ -489,7 +568,7 @@ alloc:
489 set_data_blkaddr(dn); 568 set_data_blkaddr(dn);
490 569
491 /* update i_size */ 570 /* update i_size */
492 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 571 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
493 dn->ofs_in_node; 572 dn->ofs_in_node;
494 if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) 573 if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
495 i_size_write(dn->inode, 574 i_size_write(dn->inode,
@@ -497,67 +576,33 @@ alloc:
497 return 0; 576 return 0;
498} 577}
499 578
500static int __allocate_data_blocks(struct inode *inode, loff_t offset, 579ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
501 size_t count)
502{ 580{
503 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 581 struct inode *inode = file_inode(iocb->ki_filp);
504 struct dnode_of_data dn; 582 struct f2fs_map_blocks map;
505 u64 start = F2FS_BYTES_TO_BLK(offset); 583 ssize_t ret = 0;
506 u64 len = F2FS_BYTES_TO_BLK(count);
507 bool allocated;
508 u64 end_offset;
509 int err = 0;
510
511 while (len) {
512 f2fs_lock_op(sbi);
513
514 /* When reading holes, we need its node page */
515 set_new_dnode(&dn, inode, NULL, NULL, 0);
516 err = get_dnode_of_data(&dn, start, ALLOC_NODE);
517 if (err)
518 goto out;
519
520 allocated = false;
521 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
522
523 while (dn.ofs_in_node < end_offset && len) {
524 block_t blkaddr;
525
526 if (unlikely(f2fs_cp_error(sbi))) {
527 err = -EIO;
528 goto sync_out;
529 }
530
531 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
532 if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
533 err = __allocate_data_block(&dn);
534 if (err)
535 goto sync_out;
536 allocated = true;
537 }
538 len--;
539 start++;
540 dn.ofs_in_node++;
541 }
542 584
543 if (allocated) 585 map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos);
544 sync_inode_page(&dn); 586 map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from));
587 map.m_next_pgofs = NULL;
545 588
546 f2fs_put_dnode(&dn); 589 if (f2fs_encrypted_inode(inode))
547 f2fs_unlock_op(sbi); 590 return 0;
548 591
549 f2fs_balance_fs(sbi, dn.node_changed); 592 if (iocb->ki_flags & IOCB_DIRECT) {
593 ret = f2fs_convert_inline_inode(inode);
594 if (ret)
595 return ret;
596 return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
550 } 597 }
551 return err; 598 if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
552 599 ret = f2fs_convert_inline_inode(inode);
553sync_out: 600 if (ret)
554 if (allocated) 601 return ret;
555 sync_inode_page(&dn); 602 }
556 f2fs_put_dnode(&dn); 603 if (!f2fs_has_inline_data(inode))
557out: 604 return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
558 f2fs_unlock_op(sbi); 605 return ret;
559 f2fs_balance_fs(sbi, dn.node_changed);
560 return err;
561} 606}
562 607
563/* 608/*
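Review note: f2fs_preallocate_blocks() replaces the open-coded __allocate_data_blocks() with a single f2fs_map_blocks() call in PRE_DIO mode (direct I/O: allocate now) or PRE_AIO mode (buffered: just reserve). Its caller is not part of this hunk; presumably the write_iter path, something like this sketch under that assumption:

	static ssize_t sketch_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		inode_lock(inode);
		ret = f2fs_preallocate_blocks(iocb, from);	/* reserve/allocate up front */
		if (!ret)
			ret = __generic_file_write_iter(iocb, from);
		inode_unlock(inode);
		return ret;
	}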
@@ -588,13 +633,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
588 /* it only supports block size == page size */ 633 /* it only supports block size == page size */
589 pgofs = (pgoff_t)map->m_lblk; 634 pgofs = (pgoff_t)map->m_lblk;
590 635
591 if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { 636 if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
592 map->m_pblk = ei.blk + pgofs - ei.fofs; 637 map->m_pblk = ei.blk + pgofs - ei.fofs;
593 map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); 638 map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
594 map->m_flags = F2FS_MAP_MAPPED; 639 map->m_flags = F2FS_MAP_MAPPED;
595 goto out; 640 goto out;
596 } 641 }
597 642
643next_dnode:
598 if (create) 644 if (create)
599 f2fs_lock_op(sbi); 645 f2fs_lock_op(sbi);
600 646
@@ -602,120 +648,98 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
602 set_new_dnode(&dn, inode, NULL, NULL, 0); 648 set_new_dnode(&dn, inode, NULL, NULL, 0);
603 err = get_dnode_of_data(&dn, pgofs, mode); 649 err = get_dnode_of_data(&dn, pgofs, mode);
604 if (err) { 650 if (err) {
605 if (err == -ENOENT) 651 if (err == -ENOENT) {
606 err = 0; 652 err = 0;
653 if (map->m_next_pgofs)
654 *map->m_next_pgofs =
655 get_next_page_offset(&dn, pgofs);
656 }
607 goto unlock_out; 657 goto unlock_out;
608 } 658 }
609 659
610 if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { 660 end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
661
662next_block:
663 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
664
665 if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
611 if (create) { 666 if (create) {
612 if (unlikely(f2fs_cp_error(sbi))) { 667 if (unlikely(f2fs_cp_error(sbi))) {
613 err = -EIO; 668 err = -EIO;
614 goto put_out; 669 goto sync_out;
670 }
671 if (flag == F2FS_GET_BLOCK_PRE_AIO) {
672 if (blkaddr == NULL_ADDR)
673 err = reserve_new_block(&dn);
674 } else {
675 err = __allocate_data_block(&dn);
615 } 676 }
616 err = __allocate_data_block(&dn);
617 if (err) 677 if (err)
618 goto put_out; 678 goto sync_out;
619 allocated = true; 679 allocated = true;
620 map->m_flags = F2FS_MAP_NEW; 680 map->m_flags = F2FS_MAP_NEW;
681 blkaddr = dn.data_blkaddr;
621 } else { 682 } else {
683 if (flag == F2FS_GET_BLOCK_FIEMAP &&
684 blkaddr == NULL_ADDR) {
685 if (map->m_next_pgofs)
686 *map->m_next_pgofs = pgofs + 1;
687 }
622 if (flag != F2FS_GET_BLOCK_FIEMAP || 688 if (flag != F2FS_GET_BLOCK_FIEMAP ||
623 dn.data_blkaddr != NEW_ADDR) { 689 blkaddr != NEW_ADDR) {
624 if (flag == F2FS_GET_BLOCK_BMAP) 690 if (flag == F2FS_GET_BLOCK_BMAP)
625 err = -ENOENT; 691 err = -ENOENT;
626 goto put_out; 692 goto sync_out;
627 } 693 }
628
629 /*
630 * preallocated unwritten block should be mapped
631 * for fiemap.
632 */
633 if (dn.data_blkaddr == NEW_ADDR)
634 map->m_flags = F2FS_MAP_UNWRITTEN;
635 } 694 }
636 } 695 }
637 696
638 map->m_flags |= F2FS_MAP_MAPPED; 697 if (map->m_len == 0) {
639 map->m_pblk = dn.data_blkaddr; 698 /* preallocated unwritten block should be mapped for fiemap. */
640 map->m_len = 1; 699 if (blkaddr == NEW_ADDR)
700 map->m_flags |= F2FS_MAP_UNWRITTEN;
701 map->m_flags |= F2FS_MAP_MAPPED;
702
703 map->m_pblk = blkaddr;
704 map->m_len = 1;
705 } else if ((map->m_pblk != NEW_ADDR &&
706 blkaddr == (map->m_pblk + ofs)) ||
707 (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
708 flag == F2FS_GET_BLOCK_PRE_DIO ||
709 flag == F2FS_GET_BLOCK_PRE_AIO) {
710 ofs++;
711 map->m_len++;
712 } else {
713 goto sync_out;
714 }
641 715
642 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
643 dn.ofs_in_node++; 716 dn.ofs_in_node++;
644 pgofs++; 717 pgofs++;
645 718
646get_next: 719 if (map->m_len < maxblocks) {
647 if (map->m_len >= maxblocks) 720 if (dn.ofs_in_node < end_offset)
648 goto sync_out; 721 goto next_block;
649 722
650 if (dn.ofs_in_node >= end_offset) {
651 if (allocated) 723 if (allocated)
652 sync_inode_page(&dn); 724 sync_inode_page(&dn);
653 allocated = false;
654 f2fs_put_dnode(&dn); 725 f2fs_put_dnode(&dn);
655 726
656 if (create) { 727 if (create) {
657 f2fs_unlock_op(sbi); 728 f2fs_unlock_op(sbi);
658 f2fs_balance_fs(sbi, dn.node_changed); 729 f2fs_balance_fs(sbi, allocated);
659 f2fs_lock_op(sbi);
660 }
661
662 set_new_dnode(&dn, inode, NULL, NULL, 0);
663 err = get_dnode_of_data(&dn, pgofs, mode);
664 if (err) {
665 if (err == -ENOENT)
666 err = 0;
667 goto unlock_out;
668 }
669
670 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
671 }
672
673 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
674
675 if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
676 if (create) {
677 if (unlikely(f2fs_cp_error(sbi))) {
678 err = -EIO;
679 goto sync_out;
680 }
681 err = __allocate_data_block(&dn);
682 if (err)
683 goto sync_out;
684 allocated = true;
685 map->m_flags |= F2FS_MAP_NEW;
686 blkaddr = dn.data_blkaddr;
687 } else {
688 /*
689 * we only merge preallocated unwritten blocks
690 * for fiemap.
691 */
692 if (flag != F2FS_GET_BLOCK_FIEMAP ||
693 blkaddr != NEW_ADDR)
694 goto sync_out;
695 } 730 }
696 } 731 allocated = false;
697 732 goto next_dnode;
698 /* Give more consecutive addresses for the readahead */
699 if ((map->m_pblk != NEW_ADDR &&
700 blkaddr == (map->m_pblk + ofs)) ||
701 (map->m_pblk == NEW_ADDR &&
702 blkaddr == NEW_ADDR)) {
703 ofs++;
704 dn.ofs_in_node++;
705 pgofs++;
706 map->m_len++;
707 goto get_next;
708 } 733 }
709 734
710sync_out: 735sync_out:
711 if (allocated) 736 if (allocated)
712 sync_inode_page(&dn); 737 sync_inode_page(&dn);
713put_out:
714 f2fs_put_dnode(&dn); 738 f2fs_put_dnode(&dn);
715unlock_out: 739unlock_out:
716 if (create) { 740 if (create) {
717 f2fs_unlock_op(sbi); 741 f2fs_unlock_op(sbi);
718 f2fs_balance_fs(sbi, dn.node_changed); 742 f2fs_balance_fs(sbi, allocated);
719 } 743 }
720out: 744out:
721 trace_f2fs_map_blocks(inode, map, err); 745 trace_f2fs_map_blocks(inode, map, err);
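Review note: the rewritten loop grows map->m_len only while physical blocks stay contiguous (or while PRE_DIO/PRE_AIO forces merging) and bails to sync_out as soon as the extent breaks. A stand-alone toy of that merging rule (plain C, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t blks[] = { 100, 101, 102, 200, 201 };	/* physical block addrs */
		uint64_t pblk = blks[0], len = 1;

		for (size_t i = 1; i < sizeof(blks) / sizeof(blks[0]); i++) {
			if (blks[i] == pblk + len) {	/* contiguous: extend the extent */
				len++;
			} else {			/* extent breaks: emit and restart */
				printf("extent: %llu +%llu\n",
				       (unsigned long long)pblk, (unsigned long long)len);
				pblk = blks[i];
				len = 1;
			}
		}
		printf("extent: %llu +%llu\n",
		       (unsigned long long)pblk, (unsigned long long)len);
		return 0;	/* prints "extent: 100 +3" then "extent: 200 +2" */
	}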
@@ -723,13 +747,15 @@ out:
723} 747}
724 748
725static int __get_data_block(struct inode *inode, sector_t iblock, 749static int __get_data_block(struct inode *inode, sector_t iblock,
726 struct buffer_head *bh, int create, int flag) 750 struct buffer_head *bh, int create, int flag,
751 pgoff_t *next_pgofs)
727{ 752{
728 struct f2fs_map_blocks map; 753 struct f2fs_map_blocks map;
729 int ret; 754 int ret;
730 755
731 map.m_lblk = iblock; 756 map.m_lblk = iblock;
732 map.m_len = bh->b_size >> inode->i_blkbits; 757 map.m_len = bh->b_size >> inode->i_blkbits;
758 map.m_next_pgofs = next_pgofs;
733 759
734 ret = f2fs_map_blocks(inode, &map, create, flag); 760 ret = f2fs_map_blocks(inode, &map, create, flag);
735 if (!ret) { 761 if (!ret) {
@@ -741,16 +767,18 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
741} 767}
742 768
743static int get_data_block(struct inode *inode, sector_t iblock, 769static int get_data_block(struct inode *inode, sector_t iblock,
744 struct buffer_head *bh_result, int create, int flag) 770 struct buffer_head *bh_result, int create, int flag,
771 pgoff_t *next_pgofs)
745{ 772{
746 return __get_data_block(inode, iblock, bh_result, create, flag); 773 return __get_data_block(inode, iblock, bh_result, create,
774 flag, next_pgofs);
747} 775}
748 776
749static int get_data_block_dio(struct inode *inode, sector_t iblock, 777static int get_data_block_dio(struct inode *inode, sector_t iblock,
750 struct buffer_head *bh_result, int create) 778 struct buffer_head *bh_result, int create)
751{ 779{
752 return __get_data_block(inode, iblock, bh_result, create, 780 return __get_data_block(inode, iblock, bh_result, create,
753 F2FS_GET_BLOCK_DIO); 781 F2FS_GET_BLOCK_DIO, NULL);
754} 782}
755 783
756static int get_data_block_bmap(struct inode *inode, sector_t iblock, 784static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -761,7 +789,7 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock,
761 return -EFBIG; 789 return -EFBIG;
762 790
763 return __get_data_block(inode, iblock, bh_result, create, 791 return __get_data_block(inode, iblock, bh_result, create,
764 F2FS_GET_BLOCK_BMAP); 792 F2FS_GET_BLOCK_BMAP, NULL);
765} 793}
766 794
767static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) 795static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -779,6 +807,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
779{ 807{
780 struct buffer_head map_bh; 808 struct buffer_head map_bh;
781 sector_t start_blk, last_blk; 809 sector_t start_blk, last_blk;
810 pgoff_t next_pgofs;
782 loff_t isize; 811 loff_t isize;
783 u64 logical = 0, phys = 0, size = 0; 812 u64 logical = 0, phys = 0, size = 0;
784 u32 flags = 0; 813 u32 flags = 0;
@@ -814,14 +843,15 @@ next:
814 map_bh.b_size = len; 843 map_bh.b_size = len;
815 844
816 ret = get_data_block(inode, start_blk, &map_bh, 0, 845 ret = get_data_block(inode, start_blk, &map_bh, 0,
817 F2FS_GET_BLOCK_FIEMAP); 846 F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
818 if (ret) 847 if (ret)
819 goto out; 848 goto out;
820 849
821 /* HOLE */ 850 /* HOLE */
822 if (!buffer_mapped(&map_bh)) { 851 if (!buffer_mapped(&map_bh)) {
852 start_blk = next_pgofs;
823 /* Go through holes until passing the EOF */ 853 /* Go through holes until passing the EOF */
824 if (blk_to_logical(inode, start_blk++) < isize) 854 if (blk_to_logical(inode, start_blk) < isize)
825 goto prep_next; 855 goto prep_next;
826 /* Found a hole beyond isize means no more extents. 856 /* Found a hole beyond isize means no more extents.
827 * Note that the premise is that filesystems don't 857 * Note that the premise is that filesystems don't
@@ -889,6 +919,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
889 map.m_lblk = 0; 919 map.m_lblk = 0;
890 map.m_len = 0; 920 map.m_len = 0;
891 map.m_flags = 0; 921 map.m_flags = 0;
922 map.m_next_pgofs = NULL;
892 923
893 for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { 924 for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
894 925
@@ -927,7 +958,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
927 map.m_len = last_block - block_in_file; 958 map.m_len = last_block - block_in_file;
928 959
929 if (f2fs_map_blocks(inode, &map, 0, 960 if (f2fs_map_blocks(inode, &map, 0,
930 F2FS_GET_BLOCK_READ)) 961 F2FS_GET_BLOCK_READ))
931 goto set_error_page; 962 goto set_error_page;
932 } 963 }
933got_it: 964got_it:
@@ -956,12 +987,12 @@ submit_and_realloc:
956 bio = NULL; 987 bio = NULL;
957 } 988 }
958 if (bio == NULL) { 989 if (bio == NULL) {
959 struct f2fs_crypto_ctx *ctx = NULL; 990 struct fscrypt_ctx *ctx = NULL;
960 991
961 if (f2fs_encrypted_inode(inode) && 992 if (f2fs_encrypted_inode(inode) &&
962 S_ISREG(inode->i_mode)) { 993 S_ISREG(inode->i_mode)) {
963 994
964 ctx = f2fs_get_crypto_ctx(inode); 995 ctx = fscrypt_get_ctx(inode);
965 if (IS_ERR(ctx)) 996 if (IS_ERR(ctx))
966 goto set_error_page; 997 goto set_error_page;
967 998
@@ -974,7 +1005,7 @@ submit_and_realloc:
974 min_t(int, nr_pages, BIO_MAX_PAGES)); 1005 min_t(int, nr_pages, BIO_MAX_PAGES));
975 if (!bio) { 1006 if (!bio) {
976 if (ctx) 1007 if (ctx)
977 f2fs_release_crypto_ctx(ctx); 1008 fscrypt_release_ctx(ctx);
978 goto set_error_page; 1009 goto set_error_page;
979 } 1010 }
980 bio->bi_bdev = bdev; 1011 bio->bi_bdev = bdev;
@@ -1052,10 +1083,10 @@ int do_write_data_page(struct f2fs_io_info *fio)
1052 if (err) 1083 if (err)
1053 return err; 1084 return err;
1054 1085
1055 fio->blk_addr = dn.data_blkaddr; 1086 fio->old_blkaddr = dn.data_blkaddr;
1056 1087
1057 /* This page is already truncated */ 1088 /* This page is already truncated */
1058 if (fio->blk_addr == NULL_ADDR) { 1089 if (fio->old_blkaddr == NULL_ADDR) {
1059 ClearPageUptodate(page); 1090 ClearPageUptodate(page);
1060 goto out_writepage; 1091 goto out_writepage;
1061 } 1092 }
@@ -1064,9 +1095,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
1064 1095
1065 /* wait for GCed encrypted page writeback */ 1096 /* wait for GCed encrypted page writeback */
1066 f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), 1097 f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
1067 fio->blk_addr); 1098 fio->old_blkaddr);
1068 1099
1069 fio->encrypted_page = f2fs_encrypt(inode, fio->page); 1100 fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page);
1070 if (IS_ERR(fio->encrypted_page)) { 1101 if (IS_ERR(fio->encrypted_page)) {
1071 err = PTR_ERR(fio->encrypted_page); 1102 err = PTR_ERR(fio->encrypted_page);
1072 goto out_writepage; 1103 goto out_writepage;
@@ -1079,7 +1110,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
1079 * If current allocation needs SSR, 1110 * If current allocation needs SSR,
1080 * it had better in-place writes for updated data. 1111 * it had better in-place writes for updated data.
1081 */ 1112 */
1082 if (unlikely(fio->blk_addr != NEW_ADDR && 1113 if (unlikely(fio->old_blkaddr != NEW_ADDR &&
1083 !is_cold_data(page) && 1114 !is_cold_data(page) &&
1084 !IS_ATOMIC_WRITTEN_PAGE(page) && 1115 !IS_ATOMIC_WRITTEN_PAGE(page) &&
1085 need_inplace_update(inode))) { 1116 need_inplace_update(inode))) {
@@ -1088,8 +1119,6 @@ int do_write_data_page(struct f2fs_io_info *fio)
1088 trace_f2fs_do_write_data_page(page, IPU); 1119 trace_f2fs_do_write_data_page(page, IPU);
1089 } else { 1120 } else {
1090 write_data_page(&dn, fio); 1121 write_data_page(&dn, fio);
1091 set_data_blkaddr(&dn);
1092 f2fs_update_extent_cache(&dn);
1093 trace_f2fs_do_write_data_page(page, OPU); 1122 trace_f2fs_do_write_data_page(page, OPU);
1094 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); 1123 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
1095 if (page->index == 0) 1124 if (page->index == 0)
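Reviewer note: struct f2fs_io_info loses its single blk_addr field in favor of an old_blkaddr/new_blkaddr pair (see the f2fs.h hunk below), making copy-on-write explicit: old_blkaddr is the pre-CoW location used for truncation and in-place-update checks, new_blkaddr is what actually gets written. The explicit set_data_blkaddr()/f2fs_update_extent_cache() pair removed from the OPU branch above has presumably moved behind the new f2fs_update_data_blkaddr() helper that this patch declares in f2fs.h. For reads the two fields simply alias, as in the write_begin hunk further down:

        struct f2fs_io_info fio = {
                .sbi = sbi,
                .type = DATA,
                .rw = READ_SYNC,
                .old_blkaddr = blkaddr,         /* where the data lives */
                .new_blkaddr = blkaddr,         /* reads: both sides identical */
                .page = page,
                .encrypted_page = NULL,
        };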
@@ -1177,12 +1206,18 @@ out:
1177 inode_dec_dirty_pages(inode); 1206 inode_dec_dirty_pages(inode);
1178 if (err) 1207 if (err)
1179 ClearPageUptodate(page); 1208 ClearPageUptodate(page);
1209
1210 if (wbc->for_reclaim) {
1211 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
1212 remove_dirty_inode(inode);
1213 }
1214
1180 unlock_page(page); 1215 unlock_page(page);
1181 f2fs_balance_fs(sbi, need_balance_fs); 1216 f2fs_balance_fs(sbi, need_balance_fs);
1182 if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) { 1217
1218 if (unlikely(f2fs_cp_error(sbi)))
1183 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1219 f2fs_submit_merged_bio(sbi, DATA, WRITE);
1184 remove_dirty_inode(inode); 1220
1185 }
1186 return 0; 1221 return 0;
1187 1222
1188redirty_out: 1223redirty_out:
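Reviewer note: the interleaved columns above are easier to follow as the reconstructed new-side tail of the writepage path (this appears to be f2fs_write_data_page()). Reclaim-driven writeback now flushes only the bio that contains this page, via the new f2fs_submit_merged_bio_cond(), and does so before unlocking the page; the unconditional whole-queue flush survives only for the checkpoint-error case:

        inode_dec_dirty_pages(inode);
        if (err)
                ClearPageUptodate(page);

        if (wbc->for_reclaim) {
                /* flush just the bio holding this page, not all of DATA */
                f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
                remove_dirty_inode(inode);
        }

        unlock_page(page);
        f2fs_balance_fs(sbi, need_balance_fs);

        if (unlikely(f2fs_cp_error(sbi)))
                f2fs_submit_merged_bio(sbi, DATA, WRITE);

        return 0;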
@@ -1282,7 +1317,8 @@ continue_unlock:
1282 1317
1283 if (PageWriteback(page)) { 1318 if (PageWriteback(page)) {
1284 if (wbc->sync_mode != WB_SYNC_NONE) 1319 if (wbc->sync_mode != WB_SYNC_NONE)
1285 f2fs_wait_on_page_writeback(page, DATA); 1320 f2fs_wait_on_page_writeback(page,
1321 DATA, true);
1286 else 1322 else
1287 goto continue_unlock; 1323 goto continue_unlock;
1288 } 1324 }
@@ -1339,8 +1375,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
1339 int ret; 1375 int ret;
1340 long diff; 1376 long diff;
1341 1377
1342 trace_f2fs_writepages(mapping->host, wbc, DATA);
1343
1344 /* deal with chardevs and other special file */ 1378 /* deal with chardevs and other special file */
1345 if (!mapping->a_ops->writepage) 1379 if (!mapping->a_ops->writepage)
1346 return 0; 1380 return 0;
@@ -1362,14 +1396,16 @@ static int f2fs_write_data_pages(struct address_space *mapping,
1362 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) 1396 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1363 goto skip_write; 1397 goto skip_write;
1364 1398
1399 trace_f2fs_writepages(mapping->host, wbc, DATA);
1400
1365 diff = nr_pages_to_write(sbi, DATA, wbc); 1401 diff = nr_pages_to_write(sbi, DATA, wbc);
1366 1402
1367 if (!S_ISDIR(inode->i_mode)) { 1403 if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
1368 mutex_lock(&sbi->writepages); 1404 mutex_lock(&sbi->writepages);
1369 locked = true; 1405 locked = true;
1370 } 1406 }
1371 ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); 1407 ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
1372 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1408 f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
1373 if (locked) 1409 if (locked)
1374 mutex_unlock(&sbi->writepages); 1410 mutex_unlock(&sbi->writepages);
1375 1411
@@ -1380,6 +1416,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
1380 1416
1381skip_write: 1417skip_write:
1382 wbc->pages_skipped += get_dirty_pages(inode); 1418 wbc->pages_skipped += get_dirty_pages(inode);
1419 trace_f2fs_writepages(mapping->host, wbc, DATA);
1383 return 0; 1420 return 0;
1384} 1421}
1385 1422
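Reviewer note: three behavioural tweaks hide in these f2fs_write_data_pages() hunks. First, trace_f2fs_writepages now fires after the early-exit checks, and again on the skip_write path, so traces reflect what is actually attempted. Second, the sbi->writepages mutex is taken only for WB_SYNC_ALL, so background (WB_SYNC_NONE) writeback no longer serializes. Third, the final flush becomes conditional on the inode. Condensed new side:

        if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
                mutex_lock(&sbi->writepages);   /* serialize sync writeback only */
                locked = true;
        }
        ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
        /* flush only bios that carry pages of this inode */
        f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
        if (locked)
                mutex_unlock(&sbi->writepages);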
@@ -1406,6 +1443,14 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
1406 struct extent_info ei; 1443 struct extent_info ei;
1407 int err = 0; 1444 int err = 0;
1408 1445
1446 /*
1447 * the blocks were already preallocated, so there is no need to
1448 * look up block addresses when the page does not have to be filled.
1449 */
1450 if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
1451 len == PAGE_CACHE_SIZE)
1452 return 0;
1453
1409 if (f2fs_has_inline_data(inode) || 1454 if (f2fs_has_inline_data(inode) ||
1410 (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { 1455 (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
1411 f2fs_lock_op(sbi); 1456 f2fs_lock_op(sbi);
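Reviewer note: the early return added to prepare_write_begin() is a write fast path: when a full page of a plain file (no inline data, not encrypted) is being overwritten, the page never has to be read, so no old block address is needed and the dnode lookup can be skipped entirely. The stated premise is that the blocks were already allocated, presumably by the f2fs_preallocate_blocks() helper this patch declares in f2fs.h:

        /* full-page overwrite of a plain file: nothing to look up */
        if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
                        len == PAGE_CACHE_SIZE)
                return 0;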
@@ -1425,7 +1470,7 @@ restart:
1425 if (pos + len <= MAX_INLINE_DATA) { 1470 if (pos + len <= MAX_INLINE_DATA) {
1426 read_inline_data(page, ipage); 1471 read_inline_data(page, ipage);
1427 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); 1472 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
1428 sync_inode_page(&dn); 1473 set_inline_node(ipage);
1429 } else { 1474 } else {
1430 err = f2fs_convert_inline_page(&dn, page); 1475 err = f2fs_convert_inline_page(&dn, page);
1431 if (err) 1476 if (err)
@@ -1439,13 +1484,9 @@ restart:
1439 if (f2fs_lookup_extent_cache(inode, index, &ei)) { 1484 if (f2fs_lookup_extent_cache(inode, index, &ei)) {
1440 dn.data_blkaddr = ei.blk + index - ei.fofs; 1485 dn.data_blkaddr = ei.blk + index - ei.fofs;
1441 } else { 1486 } else {
1442 bool restart = false;
1443
1444 /* hole case */ 1487 /* hole case */
1445 err = get_dnode_of_data(&dn, index, LOOKUP_NODE); 1488 err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
1446 if (err || (!err && dn.data_blkaddr == NULL_ADDR)) 1489 if (err || (!err && dn.data_blkaddr == NULL_ADDR)) {
1447 restart = true;
1448 if (restart) {
1449 f2fs_put_dnode(&dn); 1490 f2fs_put_dnode(&dn);
1450 f2fs_lock_op(sbi); 1491 f2fs_lock_op(sbi);
1451 locked = true; 1492 locked = true;
@@ -1514,7 +1555,7 @@ repeat:
1514 } 1555 }
1515 } 1556 }
1516 1557
1517 f2fs_wait_on_page_writeback(page, DATA); 1558 f2fs_wait_on_page_writeback(page, DATA, false);
1518 1559
1519 /* wait for GCed encrypted page writeback */ 1560 /* wait for GCed encrypted page writeback */
1520 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 1561 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -1541,7 +1582,8 @@ repeat:
1541 .sbi = sbi, 1582 .sbi = sbi,
1542 .type = DATA, 1583 .type = DATA,
1543 .rw = READ_SYNC, 1584 .rw = READ_SYNC,
1544 .blk_addr = blkaddr, 1585 .old_blkaddr = blkaddr,
1586 .new_blkaddr = blkaddr,
1545 .page = page, 1587 .page = page,
1546 .encrypted_page = NULL, 1588 .encrypted_page = NULL,
1547 }; 1589 };
@@ -1561,7 +1603,7 @@ repeat:
1561 1603
1562 /* avoid symlink page */ 1604 /* avoid symlink page */
1563 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { 1605 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
1564 err = f2fs_decrypt_one(inode, page); 1606 err = fscrypt_decrypt_page(page);
1565 if (err) 1607 if (err)
1566 goto fail; 1608 goto fail;
1567 } 1609 }
@@ -1592,7 +1634,6 @@ static int f2fs_write_end(struct file *file,
1592 if (pos + copied > i_size_read(inode)) { 1634 if (pos + copied > i_size_read(inode)) {
1593 i_size_write(inode, pos + copied); 1635 i_size_write(inode, pos + copied);
1594 mark_inode_dirty(inode); 1636 mark_inode_dirty(inode);
1595 update_inode_page(inode);
1596 } 1637 }
1597 1638
1598 f2fs_put_page(page, 1); 1639 f2fs_put_page(page, 1);
@@ -1617,34 +1658,21 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
1617static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 1658static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
1618 loff_t offset) 1659 loff_t offset)
1619{ 1660{
1620 struct file *file = iocb->ki_filp; 1661 struct address_space *mapping = iocb->ki_filp->f_mapping;
1621 struct address_space *mapping = file->f_mapping;
1622 struct inode *inode = mapping->host; 1662 struct inode *inode = mapping->host;
1623 size_t count = iov_iter_count(iter); 1663 size_t count = iov_iter_count(iter);
1624 int err; 1664 int err;
1625 1665
1626 /* we don't need to use inline_data strictly */ 1666 err = check_direct_IO(inode, iter, offset);
1627 err = f2fs_convert_inline_inode(inode);
1628 if (err) 1667 if (err)
1629 return err; 1668 return err;
1630 1669
1631 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 1670 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
1632 return 0; 1671 return 0;
1633 1672
1634 err = check_direct_IO(inode, iter, offset);
1635 if (err)
1636 return err;
1637
1638 trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); 1673 trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
1639 1674
1640 if (iov_iter_rw(iter) == WRITE) {
1641 err = __allocate_data_blocks(inode, offset, count);
1642 if (err)
1643 goto out;
1644 }
1645
1646 err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); 1675 err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
1647out:
1648 if (err < 0 && iov_iter_rw(iter) == WRITE) 1676 if (err < 0 && iov_iter_rw(iter) == WRITE)
1649 f2fs_write_failed(mapping, offset + count); 1677 f2fs_write_failed(mapping, offset + count);
1650 1678
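Reviewer note: f2fs_direct_IO() reconstructed from the new side, since the two-column rendering makes the reordering hard to see. The unconditional inline-data conversion is gone, check_direct_IO() now runs first, and the up-front __allocate_data_blocks() call for writes is dropped (preallocation is expected to happen earlier, again presumably via f2fs_preallocate_blocks()); the exit tracepoint past the end of the hunk is elided here:

        static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                                loff_t offset)
        {
                struct address_space *mapping = iocb->ki_filp->f_mapping;
                struct inode *inode = mapping->host;
                size_t count = iov_iter_count(iter);
                int err;

                err = check_direct_IO(inode, iter, offset);
                if (err)
                        return err;

                /* encrypted regular files cannot go through blockdev DIO */
                if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
                        return 0;

                trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));

                err = blockdev_direct_IO(iocb, inode, iter, offset,
                                        get_data_block_dio);
                if (err < 0 && iov_iter_rw(iter) == WRITE)
                        f2fs_write_failed(mapping, offset + count);

                return err;
        }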
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index faa7495e2d7e..80641ad82745 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -77,7 +77,7 @@ static unsigned long dir_block_index(unsigned int level,
77} 77}
78 78
79static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, 79static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
80 struct f2fs_filename *fname, 80 struct fscrypt_name *fname,
81 f2fs_hash_t namehash, 81 f2fs_hash_t namehash,
82 int *max_slots, 82 int *max_slots,
83 struct page **res_page) 83 struct page **res_page)
@@ -103,15 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
103 return de; 103 return de;
104} 104}
105 105
106struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, 106struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
107 f2fs_hash_t namehash, int *max_slots, 107 f2fs_hash_t namehash, int *max_slots,
108 struct f2fs_dentry_ptr *d) 108 struct f2fs_dentry_ptr *d)
109{ 109{
110 struct f2fs_dir_entry *de; 110 struct f2fs_dir_entry *de;
111 unsigned long bit_pos = 0; 111 unsigned long bit_pos = 0;
112 int max_len = 0; 112 int max_len = 0;
113 struct f2fs_str de_name = FSTR_INIT(NULL, 0); 113 struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
114 struct f2fs_str *name = &fname->disk_name; 114 struct fscrypt_str *name = &fname->disk_name;
115 115
116 if (max_slots) 116 if (max_slots)
117 *max_slots = 0; 117 *max_slots = 0;
@@ -157,7 +157,7 @@ found:
157 157
158static struct f2fs_dir_entry *find_in_level(struct inode *dir, 158static struct f2fs_dir_entry *find_in_level(struct inode *dir,
159 unsigned int level, 159 unsigned int level,
160 struct f2fs_filename *fname, 160 struct fscrypt_name *fname,
161 struct page **res_page) 161 struct page **res_page)
162{ 162{
163 struct qstr name = FSTR_TO_QSTR(&fname->disk_name); 163 struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
@@ -218,12 +218,12 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
218 struct f2fs_dir_entry *de = NULL; 218 struct f2fs_dir_entry *de = NULL;
219 unsigned int max_depth; 219 unsigned int max_depth;
220 unsigned int level; 220 unsigned int level;
221 struct f2fs_filename fname; 221 struct fscrypt_name fname;
222 int err; 222 int err;
223 223
224 *res_page = NULL; 224 *res_page = NULL;
225 225
226 err = f2fs_fname_setup_filename(dir, child, 1, &fname); 226 err = fscrypt_setup_filename(dir, child, 1, &fname);
227 if (err) 227 if (err)
228 return NULL; 228 return NULL;
229 229
@@ -251,7 +251,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
251 break; 251 break;
252 } 252 }
253out: 253out:
254 f2fs_fname_free_filename(&fname); 254 fscrypt_free_filename(&fname);
255 return de; 255 return de;
256} 256}
257 257
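Reviewer note: from here on the directory code drops its private f2fs_filename/f2fs_str types for the shared fscrypt_name/fscrypt_str (the old definitions are deleted from f2fs.h later in this diff), and the setup/free helpers are renamed to their fscrypt equivalents. The lookup pattern after conversion, condensed from this hunk (the third argument is the lookup flag, as used at this call site):

        struct fscrypt_name fname;
        int err;

        err = fscrypt_setup_filename(dir, child, 1, &fname);
        if (err)
                return NULL;

        /* ... walk the hash levels using fname.disk_name ... */

        fscrypt_free_filename(&fname);  /* frees any crypto name buffer */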
@@ -296,7 +296,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
296{ 296{
297 enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; 297 enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
298 lock_page(page); 298 lock_page(page);
299 f2fs_wait_on_page_writeback(page, type); 299 f2fs_wait_on_page_writeback(page, type, true);
300 de->ino = cpu_to_le32(inode->i_ino); 300 de->ino = cpu_to_le32(inode->i_ino);
301 set_de_type(de, inode->i_mode); 301 set_de_type(de, inode->i_mode);
302 f2fs_dentry_kunmap(dir, page); 302 f2fs_dentry_kunmap(dir, page);
@@ -311,7 +311,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
311{ 311{
312 struct f2fs_inode *ri; 312 struct f2fs_inode *ri;
313 313
314 f2fs_wait_on_page_writeback(ipage, NODE); 314 f2fs_wait_on_page_writeback(ipage, NODE, true);
315 315
316 /* copy name info. to this inode page */ 316 /* copy name info. to this inode page */
317 ri = F2FS_INODE(ipage); 317 ri = F2FS_INODE(ipage);
@@ -341,24 +341,14 @@ int update_dent_inode(struct inode *inode, struct inode *to,
341void do_make_empty_dir(struct inode *inode, struct inode *parent, 341void do_make_empty_dir(struct inode *inode, struct inode *parent,
342 struct f2fs_dentry_ptr *d) 342 struct f2fs_dentry_ptr *d)
343{ 343{
344 struct f2fs_dir_entry *de; 344 struct qstr dot = QSTR_INIT(".", 1);
345 345 struct qstr dotdot = QSTR_INIT("..", 2);
346 de = &d->dentry[0];
347 de->name_len = cpu_to_le16(1);
348 de->hash_code = 0;
349 de->ino = cpu_to_le32(inode->i_ino);
350 memcpy(d->filename[0], ".", 1);
351 set_de_type(de, inode->i_mode);
352 346
353 de = &d->dentry[1]; 347 /* update dirent of "." */
354 de->hash_code = 0; 348 f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);
355 de->name_len = cpu_to_le16(2);
356 de->ino = cpu_to_le32(parent->i_ino);
357 memcpy(d->filename[1], "..", 2);
358 set_de_type(de, parent->i_mode);
359 349
360 test_and_set_bit_le(0, (void *)d->bitmap); 350 /* update dirent of ".." */
361 test_and_set_bit_le(1, (void *)d->bitmap); 351 f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1);
362} 352}
363 353
364static int make_empty_dir(struct inode *inode, 354static int make_empty_dir(struct inode *inode,
@@ -413,7 +403,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
413 goto put_error; 403 goto put_error;
414 404
415 if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { 405 if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
416 err = f2fs_inherit_context(dir, inode, page); 406 err = fscrypt_inherit_context(dir, inode, page, false);
417 if (err) 407 if (err)
418 goto put_error; 408 goto put_error;
419 } 409 }
@@ -511,8 +501,12 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
511 memcpy(d->filename[bit_pos], name->name, name->len); 501 memcpy(d->filename[bit_pos], name->name, name->len);
512 de->ino = cpu_to_le32(ino); 502 de->ino = cpu_to_le32(ino);
513 set_de_type(de, mode); 503 set_de_type(de, mode);
514 for (i = 0; i < slots; i++) 504 for (i = 0; i < slots; i++) {
515 test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); 505 test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
506 /* avoid exposing garbage data to readdir */
507 if (i)
508 (de + i)->name_len = 0;
509 }
516} 510}
517 511
518/* 512/*
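Reviewer note: f2fs_update_dentry() now zeroes name_len in every continuation slot of a multi-slot entry. This pairs with the f2fs_fill_dentries() hunk below, which skips slots whose name_len is zero, so readdir can no longer emit garbage from a slot whose bitmap bit is set but whose dentry payload is stale or partially written. The two sides together:

        /* writer side, f2fs_update_dentry(): */
        for (i = 0; i < slots; i++) {
                test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
                if (i)
                        (de + i)->name_len = 0; /* mark continuation slot */
        }

        /* reader side, f2fs_fill_dentries(): */
        de = &d->dentry[bit_pos];
        if (de->name_len == 0) {                /* continuation or stale slot */
                bit_pos++;
                ctx->pos = start_pos + bit_pos;
                continue;
        }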
@@ -532,11 +526,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
532 struct f2fs_dentry_block *dentry_blk = NULL; 526 struct f2fs_dentry_block *dentry_blk = NULL;
533 struct f2fs_dentry_ptr d; 527 struct f2fs_dentry_ptr d;
534 struct page *page = NULL; 528 struct page *page = NULL;
535 struct f2fs_filename fname; 529 struct fscrypt_name fname;
536 struct qstr new_name; 530 struct qstr new_name;
537 int slots, err; 531 int slots, err;
538 532
539 err = f2fs_fname_setup_filename(dir, name, 0, &fname); 533 err = fscrypt_setup_filename(dir, name, 0, &fname);
540 if (err) 534 if (err)
541 return err; 535 return err;
542 536
@@ -598,7 +592,7 @@ start:
598 ++level; 592 ++level;
599 goto start; 593 goto start;
600add_dentry: 594add_dentry:
601 f2fs_wait_on_page_writeback(dentry_page, DATA); 595 f2fs_wait_on_page_writeback(dentry_page, DATA, true);
602 596
603 if (inode) { 597 if (inode) {
604 down_write(&F2FS_I(inode)->i_sem); 598 down_write(&F2FS_I(inode)->i_sem);
@@ -635,7 +629,7 @@ fail:
635 kunmap(dentry_page); 629 kunmap(dentry_page);
636 f2fs_put_page(dentry_page, 1); 630 f2fs_put_page(dentry_page, 1);
637out: 631out:
638 f2fs_fname_free_filename(&fname); 632 fscrypt_free_filename(&fname);
639 f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); 633 f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
640 return err; 634 return err;
641} 635}
@@ -709,7 +703,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
709 return f2fs_delete_inline_entry(dentry, page, dir, inode); 703 return f2fs_delete_inline_entry(dentry, page, dir, inode);
710 704
711 lock_page(page); 705 lock_page(page);
712 f2fs_wait_on_page_writeback(page, DATA); 706 f2fs_wait_on_page_writeback(page, DATA, true);
713 707
714 dentry_blk = page_address(page); 708 dentry_blk = page_address(page);
715 bit_pos = dentry - dentry_blk->dentry; 709 bit_pos = dentry - dentry_blk->dentry;
@@ -777,12 +771,12 @@ bool f2fs_empty_dir(struct inode *dir)
777} 771}
778 772
779bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, 773bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
780 unsigned int start_pos, struct f2fs_str *fstr) 774 unsigned int start_pos, struct fscrypt_str *fstr)
781{ 775{
782 unsigned char d_type = DT_UNKNOWN; 776 unsigned char d_type = DT_UNKNOWN;
783 unsigned int bit_pos; 777 unsigned int bit_pos;
784 struct f2fs_dir_entry *de = NULL; 778 struct f2fs_dir_entry *de = NULL;
785 struct f2fs_str de_name = FSTR_INIT(NULL, 0); 779 struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
786 780
787 bit_pos = ((unsigned long)ctx->pos % d->max); 781 bit_pos = ((unsigned long)ctx->pos % d->max);
788 782
@@ -792,6 +786,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
792 break; 786 break;
793 787
794 de = &d->dentry[bit_pos]; 788 de = &d->dentry[bit_pos];
789 if (de->name_len == 0) {
790 bit_pos++;
791 ctx->pos = start_pos + bit_pos;
792 continue;
793 }
794
795 if (de->file_type < F2FS_FT_MAX) 795 if (de->file_type < F2FS_FT_MAX)
796 d_type = f2fs_filetype_table[de->file_type]; 796 d_type = f2fs_filetype_table[de->file_type];
797 else 797 else
@@ -810,8 +810,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
810 810
811 memcpy(de_name.name, d->filename[bit_pos], de_name.len); 811 memcpy(de_name.name, d->filename[bit_pos], de_name.len);
812 812
813 ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, 813 ret = fscrypt_fname_disk_to_usr(d->inode,
814 &de_name, fstr); 814 (u32)de->hash_code, 0,
815 &de_name, fstr);
815 kfree(de_name.name); 816 kfree(de_name.name);
816 if (ret < 0) 817 if (ret < 0)
817 return true; 818 return true;
@@ -839,16 +840,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
839 struct file_ra_state *ra = &file->f_ra; 840 struct file_ra_state *ra = &file->f_ra;
840 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); 841 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
841 struct f2fs_dentry_ptr d; 842 struct f2fs_dentry_ptr d;
842 struct f2fs_str fstr = FSTR_INIT(NULL, 0); 843 struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
843 int err = 0; 844 int err = 0;
844 845
845 if (f2fs_encrypted_inode(inode)) { 846 if (f2fs_encrypted_inode(inode)) {
846 err = f2fs_get_encryption_info(inode); 847 err = fscrypt_get_encryption_info(inode);
847 if (err) 848 if (err && err != -ENOKEY)
848 return err; 849 return err;
849 850
850 err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, 851 err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
851 &fstr);
852 if (err < 0) 852 if (err < 0)
853 return err; 853 return err;
854 } 854 }
@@ -888,15 +888,23 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
888 f2fs_put_page(dentry_page, 1); 888 f2fs_put_page(dentry_page, 1);
889 } 889 }
890out: 890out:
891 f2fs_fname_crypto_free_buffer(&fstr); 891 fscrypt_fname_free_buffer(&fstr);
892 return err; 892 return err;
893} 893}
894 894
895static int f2fs_dir_open(struct inode *inode, struct file *filp)
896{
897 if (f2fs_encrypted_inode(inode))
898 return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
899 return 0;
900}
901
895const struct file_operations f2fs_dir_operations = { 902const struct file_operations f2fs_dir_operations = {
896 .llseek = generic_file_llseek, 903 .llseek = generic_file_llseek,
897 .read = generic_read_dir, 904 .read = generic_read_dir,
898 .iterate = f2fs_readdir, 905 .iterate = f2fs_readdir,
899 .fsync = f2fs_sync_file, 906 .fsync = f2fs_sync_file,
907 .open = f2fs_dir_open,
900 .unlocked_ioctl = f2fs_ioctl, 908 .unlocked_ioctl = f2fs_ioctl,
901#ifdef CONFIG_COMPAT 909#ifdef CONFIG_COMPAT
902 .compat_ioctl = f2fs_compat_ioctl, 910 .compat_ioctl = f2fs_compat_ioctl,
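Reviewer note: the new f2fs_dir_open() makes the key requirement explicit at open time: if the encryption info of an encrypted directory cannot be set up, the open fails with -EACCES. f2fs_readdir(), in the same patch, is relaxed to tolerate a missing key so that listing the on-disk (cipher) names still works:

        err = fscrypt_get_encryption_info(inode);
        if (err && err != -ENOKEY)      /* a missing key is acceptable here */
                return err;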
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ccd5c636d3fe..c859bb044728 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -33,6 +33,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
33 33
34 en->ei = *ei; 34 en->ei = *ei;
35 INIT_LIST_HEAD(&en->list); 35 INIT_LIST_HEAD(&en->list);
36 en->et = et;
36 37
37 rb_link_node(&en->rb_node, parent, p); 38 rb_link_node(&en->rb_node, parent, p);
38 rb_insert_color(&en->rb_node, &et->root); 39 rb_insert_color(&en->rb_node, &et->root);
@@ -50,6 +51,24 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
50 51
51 if (et->cached_en == en) 52 if (et->cached_en == en)
52 et->cached_en = NULL; 53 et->cached_en = NULL;
54 kmem_cache_free(extent_node_slab, en);
55}
56
57/*
58 * Flow to release an extent_node:
59 * 1. list_del_init
60 * 2. __detach_extent_node
61 * 3. kmem_cache_free.
62 */
63static void __release_extent_node(struct f2fs_sb_info *sbi,
64 struct extent_tree *et, struct extent_node *en)
65{
66 spin_lock(&sbi->extent_lock);
67 f2fs_bug_on(sbi, list_empty(&en->list));
68 list_del_init(&en->list);
69 spin_unlock(&sbi->extent_lock);
70
71 __detach_extent_node(sbi, et, en);
53} 72}
54 73
55static struct extent_tree *__grab_extent_tree(struct inode *inode) 74static struct extent_tree *__grab_extent_tree(struct inode *inode)
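Reviewer note: kmem_cache_free() moves inside __detach_extent_node(), so detaching a node now always frees it, and the new __release_extent_node() wrapper enforces the documented ordering: delist from the global LRU under sbi->extent_lock first, then detach from the rb-tree and free. The one caller that reaches nodes through the LRU itself, the shrinker below, does its own list_del_init() and therefore calls __detach_extent_node() directly, since __release_extent_node()'s f2fs_bug_on() would trip on an already-delisted node:

        spin_lock(&sbi->extent_lock);
        f2fs_bug_on(sbi, list_empty(&en->list)); /* must still be on the LRU */
        list_del_init(&en->list);
        spin_unlock(&sbi->extent_lock);
        __detach_extent_node(sbi, et, en);       /* rb_erase + kmem_cache_free */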
@@ -129,7 +148,7 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
129} 148}
130 149
131static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, 150static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
132 struct extent_tree *et, bool free_all) 151 struct extent_tree *et)
133{ 152{
134 struct rb_node *node, *next; 153 struct rb_node *node, *next;
135 struct extent_node *en; 154 struct extent_node *en;
@@ -139,18 +158,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
139 while (node) { 158 while (node) {
140 next = rb_next(node); 159 next = rb_next(node);
141 en = rb_entry(node, struct extent_node, rb_node); 160 en = rb_entry(node, struct extent_node, rb_node);
142 161 __release_extent_node(sbi, et, en);
143 if (free_all) {
144 spin_lock(&sbi->extent_lock);
145 if (!list_empty(&en->list))
146 list_del_init(&en->list);
147 spin_unlock(&sbi->extent_lock);
148 }
149
150 if (free_all || list_empty(&en->list)) {
151 __detach_extent_node(sbi, et, en);
152 kmem_cache_free(extent_node_slab, en);
153 }
154 node = next; 162 node = next;
155 } 163 }
156 164
@@ -232,9 +240,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
232 if (en) { 240 if (en) {
233 *ei = en->ei; 241 *ei = en->ei;
234 spin_lock(&sbi->extent_lock); 242 spin_lock(&sbi->extent_lock);
235 if (!list_empty(&en->list)) 243 if (!list_empty(&en->list)) {
236 list_move_tail(&en->list, &sbi->extent_list); 244 list_move_tail(&en->list, &sbi->extent_list);
237 et->cached_en = en; 245 et->cached_en = en;
246 }
238 spin_unlock(&sbi->extent_lock); 247 spin_unlock(&sbi->extent_lock);
239 ret = true; 248 ret = true;
240 } 249 }
@@ -329,7 +338,6 @@ lookup_neighbors:
329 338
330static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, 339static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
331 struct extent_tree *et, struct extent_info *ei, 340 struct extent_tree *et, struct extent_info *ei,
332 struct extent_node **den,
333 struct extent_node *prev_ex, 341 struct extent_node *prev_ex,
334 struct extent_node *next_ex) 342 struct extent_node *next_ex)
335{ 343{
@@ -342,20 +350,25 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
342 } 350 }
343 351
344 if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { 352 if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
345 if (en) { 353 if (en)
346 __detach_extent_node(sbi, et, prev_ex); 354 __release_extent_node(sbi, et, prev_ex);
347 *den = prev_ex;
348 }
349 next_ex->ei.fofs = ei->fofs; 355 next_ex->ei.fofs = ei->fofs;
350 next_ex->ei.blk = ei->blk; 356 next_ex->ei.blk = ei->blk;
351 next_ex->ei.len += ei->len; 357 next_ex->ei.len += ei->len;
352 en = next_ex; 358 en = next_ex;
353 } 359 }
354 360
355 if (en) { 361 if (!en)
356 __try_update_largest_extent(et, en); 362 return NULL;
363
364 __try_update_largest_extent(et, en);
365
366 spin_lock(&sbi->extent_lock);
367 if (!list_empty(&en->list)) {
368 list_move_tail(&en->list, &sbi->extent_list);
357 et->cached_en = en; 369 et->cached_en = en;
358 } 370 }
371 spin_unlock(&sbi->extent_lock);
359 return en; 372 return en;
360} 373}
361 374
@@ -391,7 +404,12 @@ do_insert:
391 return NULL; 404 return NULL;
392 405
393 __try_update_largest_extent(et, en); 406 __try_update_largest_extent(et, en);
407
408 /* update in global extent list */
409 spin_lock(&sbi->extent_lock);
410 list_add_tail(&en->list, &sbi->extent_list);
394 et->cached_en = en; 411 et->cached_en = en;
412 spin_unlock(&sbi->extent_lock);
395 return en; 413 return en;
396} 414}
397 415
@@ -479,7 +497,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
479 if (parts) 497 if (parts)
480 __try_update_largest_extent(et, en); 498 __try_update_largest_extent(et, en);
481 else 499 else
482 __detach_extent_node(sbi, et, en); 500 __release_extent_node(sbi, et, en);
483 501
484 /* 502 /*
485 * if original extent is split into zero or two parts, extent 503 * if original extent is split into zero or two parts, extent
@@ -490,31 +508,15 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
490 insert_p = NULL; 508 insert_p = NULL;
491 insert_parent = NULL; 509 insert_parent = NULL;
492 } 510 }
493
494 /* update in global extent list */
495 spin_lock(&sbi->extent_lock);
496 if (!parts && !list_empty(&en->list))
497 list_del(&en->list);
498 if (en1)
499 list_add_tail(&en1->list, &sbi->extent_list);
500 spin_unlock(&sbi->extent_lock);
501
502 /* release extent node */
503 if (!parts)
504 kmem_cache_free(extent_node_slab, en);
505
506 en = next_en; 511 en = next_en;
507 } 512 }
508 513
509 /* 3. update extent in extent cache */ 514 /* 3. update extent in extent cache */
510 if (blkaddr) { 515 if (blkaddr) {
511 struct extent_node *den = NULL;
512 516
513 set_extent_info(&ei, fofs, blkaddr, len); 517 set_extent_info(&ei, fofs, blkaddr, len);
514 en1 = __try_merge_extent_node(sbi, et, &ei, &den, 518 if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
515 prev_en, next_en); 519 __insert_extent_tree(sbi, et, &ei,
516 if (!en1)
517 en1 = __insert_extent_tree(sbi, et, &ei,
518 insert_p, insert_parent); 520 insert_p, insert_parent);
519 521
520 /* give up extent_cache, if split and small updates happen */ 522 /* give up extent_cache, if split and small updates happen */
@@ -524,24 +526,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
524 et->largest.len = 0; 526 et->largest.len = 0;
525 set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); 527 set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
526 } 528 }
527
528 spin_lock(&sbi->extent_lock);
529 if (en1) {
530 if (list_empty(&en1->list))
531 list_add_tail(&en1->list, &sbi->extent_list);
532 else
533 list_move_tail(&en1->list, &sbi->extent_list);
534 }
535 if (den && !list_empty(&den->list))
536 list_del(&den->list);
537 spin_unlock(&sbi->extent_lock);
538
539 if (den)
540 kmem_cache_free(extent_node_slab, den);
541 } 529 }
542 530
543 if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) 531 if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
544 __free_extent_tree(sbi, et, true); 532 __free_extent_tree(sbi, et);
545 533
546 write_unlock(&et->lock); 534 write_unlock(&et->lock);
547 535
@@ -550,14 +538,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
550 538
551unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) 539unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
552{ 540{
553 struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
554 struct extent_tree *et, *next; 541 struct extent_tree *et, *next;
555 struct extent_node *en, *tmp; 542 struct extent_node *en;
556 unsigned long ino = F2FS_ROOT_INO(sbi);
557 unsigned int found;
558 unsigned int node_cnt = 0, tree_cnt = 0; 543 unsigned int node_cnt = 0, tree_cnt = 0;
559 int remained; 544 int remained;
560 bool do_free = false;
561 545
562 if (!test_opt(sbi, EXTENT_CACHE)) 546 if (!test_opt(sbi, EXTENT_CACHE))
563 return 0; 547 return 0;
@@ -572,10 +556,10 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
572 list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { 556 list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
573 if (atomic_read(&et->node_cnt)) { 557 if (atomic_read(&et->node_cnt)) {
574 write_lock(&et->lock); 558 write_lock(&et->lock);
575 node_cnt += __free_extent_tree(sbi, et, true); 559 node_cnt += __free_extent_tree(sbi, et);
576 write_unlock(&et->lock); 560 write_unlock(&et->lock);
577 } 561 }
578 562 f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
579 list_del_init(&et->list); 563 list_del_init(&et->list);
580 radix_tree_delete(&sbi->extent_tree_root, et->ino); 564 radix_tree_delete(&sbi->extent_tree_root, et->ino);
581 kmem_cache_free(extent_tree_slab, et); 565 kmem_cache_free(extent_tree_slab, et);
@@ -585,6 +569,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
585 569
586 if (node_cnt + tree_cnt >= nr_shrink) 570 if (node_cnt + tree_cnt >= nr_shrink)
587 goto unlock_out; 571 goto unlock_out;
572 cond_resched();
588 } 573 }
589 up_write(&sbi->extent_tree_lock); 574 up_write(&sbi->extent_tree_lock);
590 575
@@ -596,42 +581,29 @@ free_node:
596 remained = nr_shrink - (node_cnt + tree_cnt); 581 remained = nr_shrink - (node_cnt + tree_cnt);
597 582
598 spin_lock(&sbi->extent_lock); 583 spin_lock(&sbi->extent_lock);
599 list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { 584 for (; remained > 0; remained--) {
600 if (!remained--) 585 if (list_empty(&sbi->extent_list))
601 break; 586 break;
602 list_del_init(&en->list); 587 en = list_first_entry(&sbi->extent_list,
603 do_free = true; 588 struct extent_node, list);
604 } 589 et = en->et;
605 spin_unlock(&sbi->extent_lock); 590 if (!write_trylock(&et->lock)) {
606 591 /* refresh this extent node's position in extent list */
607 if (do_free == false) 592 list_move_tail(&en->list, &sbi->extent_list);
608 goto unlock_out; 593 continue;
609 594 }
610 /*
611 * reset ino for searching victims from beginning of global extent tree.
612 */
613 ino = F2FS_ROOT_INO(sbi);
614
615 while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
616 (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
617 unsigned i;
618
619 ino = treevec[found - 1]->ino + 1;
620 for (i = 0; i < found; i++) {
621 struct extent_tree *et = treevec[i];
622 595
623 if (!atomic_read(&et->node_cnt)) 596 list_del_init(&en->list);
624 continue; 597 spin_unlock(&sbi->extent_lock);
625 598
626 if (write_trylock(&et->lock)) { 599 __detach_extent_node(sbi, et, en);
627 node_cnt += __free_extent_tree(sbi, et, false);
628 write_unlock(&et->lock);
629 }
630 600
631 if (node_cnt + tree_cnt >= nr_shrink) 601 write_unlock(&et->lock);
632 goto unlock_out; 602 node_cnt++;
633 } 603 spin_lock(&sbi->extent_lock);
634 } 604 }
605 spin_unlock(&sbi->extent_lock);
606
635unlock_out: 607unlock_out:
636 up_write(&sbi->extent_tree_lock); 608 up_write(&sbi->extent_tree_lock);
637out: 609out:
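Reviewer note: the shrinker loses the radix-tree gang lookup and its treevec scan entirely. It now consumes the global LRU oldest-first, using the en->et back-pointer added to struct extent_node (see the f2fs.h hunk below) to reach the owning tree, and write_trylock() so it never sleeps on a busy tree; busy victims are rotated to the tail instead. The new-side loop, reconstructed:

        spin_lock(&sbi->extent_lock);
        for (; remained > 0; remained--) {
                if (list_empty(&sbi->extent_list))
                        break;
                en = list_first_entry(&sbi->extent_list,
                                        struct extent_node, list);
                et = en->et;                    /* back-pointer to owning tree */
                if (!write_trylock(&et->lock)) {
                        /* tree busy: rotate the node, try the next victim */
                        list_move_tail(&en->list, &sbi->extent_list);
                        continue;
                }

                list_del_init(&en->list);
                spin_unlock(&sbi->extent_lock);

                __detach_extent_node(sbi, et, en);      /* also frees en */

                write_unlock(&et->lock);
                node_cnt++;
                spin_lock(&sbi->extent_lock);
        }
        spin_unlock(&sbi->extent_lock);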
@@ -650,7 +622,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
650 return 0; 622 return 0;
651 623
652 write_lock(&et->lock); 624 write_lock(&et->lock);
653 node_cnt = __free_extent_tree(sbi, et, true); 625 node_cnt = __free_extent_tree(sbi, et);
654 write_unlock(&et->lock); 626 write_unlock(&et->lock);
655 627
656 return node_cnt; 628 return node_cnt;
@@ -701,19 +673,21 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
701 673
702void f2fs_update_extent_cache(struct dnode_of_data *dn) 674void f2fs_update_extent_cache(struct dnode_of_data *dn)
703{ 675{
704 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
705 pgoff_t fofs; 676 pgoff_t fofs;
677 block_t blkaddr;
706 678
707 if (!f2fs_may_extent_tree(dn->inode)) 679 if (!f2fs_may_extent_tree(dn->inode))
708 return; 680 return;
709 681
710 f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); 682 if (dn->data_blkaddr == NEW_ADDR)
711 683 blkaddr = NULL_ADDR;
684 else
685 blkaddr = dn->data_blkaddr;
712 686
713 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 687 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
714 dn->ofs_in_node; 688 dn->ofs_in_node;
715 689
716 if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) 690 if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1))
717 sync_inode_page(dn); 691 sync_inode_page(dn);
718} 692}
719 693
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ff79054c6cf6..bbe2cd1265d0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,10 +22,11 @@
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/fscrypto.h>
26#include <crypto/hash.h>
25 27
26#ifdef CONFIG_F2FS_CHECK_FS 28#ifdef CONFIG_F2FS_CHECK_FS
27#define f2fs_bug_on(sbi, condition) BUG_ON(condition) 29#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
28#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
29#else 30#else
30#define f2fs_bug_on(sbi, condition) \ 31#define f2fs_bug_on(sbi, condition) \
31 do { \ 32 do { \
@@ -34,7 +35,6 @@
34 set_sbi_flag(sbi, SBI_NEED_FSCK); \ 35 set_sbi_flag(sbi, SBI_NEED_FSCK); \
35 } \ 36 } \
36 } while (0) 37 } while (0)
37#define f2fs_down_write(x, y) down_write(x)
38#endif 38#endif
39 39
40/* 40/*
@@ -84,27 +84,6 @@ struct f2fs_mount_info {
84#define F2FS_CLEAR_FEATURE(sb, mask) \ 84#define F2FS_CLEAR_FEATURE(sb, mask) \
85 F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) 85 F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
86 86
87#define CRCPOLY_LE 0xedb88320
88
89static inline __u32 f2fs_crc32(void *buf, size_t len)
90{
91 unsigned char *p = (unsigned char *)buf;
92 __u32 crc = F2FS_SUPER_MAGIC;
93 int i;
94
95 while (len--) {
96 crc ^= *p++;
97 for (i = 0; i < 8; i++)
98 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
99 }
100 return crc;
101}
102
103static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
104{
105 return f2fs_crc32(buf, buf_size) == blk_crc;
106}
107
108/* 87/*
109 * For checkpoint manager 88 * For checkpoint manager
110 */ 89 */
@@ -183,37 +162,37 @@ struct fsync_inode_entry {
183 block_t last_inode; /* block address locating the last inode */ 162 block_t last_inode; /* block address locating the last inode */
184}; 163};
185 164
186#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) 165#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
187#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) 166#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
188 167
189#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) 168#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
190#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) 169#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
191#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) 170#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
192#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) 171#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
193 172
194#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) 173#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
195#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) 174#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
196 175
197static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) 176static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
198{ 177{
199 int before = nats_in_cursum(rs); 178 int before = nats_in_cursum(journal);
200 rs->n_nats = cpu_to_le16(before + i); 179 journal->n_nats = cpu_to_le16(before + i);
201 return before; 180 return before;
202} 181}
203 182
204static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) 183static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
205{ 184{
206 int before = sits_in_cursum(rs); 185 int before = sits_in_cursum(journal);
207 rs->n_sits = cpu_to_le16(before + i); 186 journal->n_sits = cpu_to_le16(before + i);
208 return before; 187 return before;
209} 188}
210 189
211static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, 190static inline bool __has_cursum_space(struct f2fs_journal *journal,
212 int type) 191 int size, int type)
213{ 192{
214 if (type == NAT_JOURNAL) 193 if (type == NAT_JOURNAL)
215 return size <= MAX_NAT_JENTRIES(sum); 194 return size <= MAX_NAT_JENTRIES(journal);
216 return size <= MAX_SIT_JENTRIES(sum); 195 return size <= MAX_SIT_JENTRIES(journal);
217} 196}
218 197
219/* 198/*
@@ -233,12 +212,9 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
233#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) 212#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
234#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) 213#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
235 214
236#define F2FS_IOC_SET_ENCRYPTION_POLICY \ 215#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
237 _IOR('f', 19, struct f2fs_encryption_policy) 216#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
238#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ 217#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
239 _IOW('f', 20, __u8[16])
240#define F2FS_IOC_GET_ENCRYPTION_POLICY \
241 _IOW('f', 21, struct f2fs_encryption_policy)
242 218
243/* 219/*
244 * should be same as XFS_IOC_GOINGDOWN. 220 * should be same as XFS_IOC_GOINGDOWN.
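Reviewer note: the f2fs-private encryption ioctls (and their struct f2fs_encryption_policy payload) become aliases for the common FS_IOC_* definitions, so a single userspace tool can drive ext4 and f2fs encryption alike; since the old numbers already used 'f'/19..21, the binary ABI should be unchanged. Hypothetical userspace usage after this change (the shared policy struct name is an assumption, it is not shown in this diff):

        struct fscrypt_policy policy = { /* ... */ };

        if (ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy) < 0)
                perror("set encryption policy");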
@@ -268,25 +244,6 @@ struct f2fs_defragment {
268 * For INODE and NODE manager 244 * For INODE and NODE manager
269 */ 245 */
270/* for directory operations */ 246/* for directory operations */
271struct f2fs_str {
272 unsigned char *name;
273 u32 len;
274};
275
276struct f2fs_filename {
277 const struct qstr *usr_fname;
278 struct f2fs_str disk_name;
279 f2fs_hash_t hash;
280#ifdef CONFIG_F2FS_FS_ENCRYPTION
281 struct f2fs_str crypto_buf;
282#endif
283};
284
285#define FSTR_INIT(n, l) { .name = n, .len = l }
286#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
287#define fname_name(p) ((p)->disk_name.name)
288#define fname_len(p) ((p)->disk_name.len)
289
290struct f2fs_dentry_ptr { 247struct f2fs_dentry_ptr {
291 struct inode *inode; 248 struct inode *inode;
292 const void *bitmap; 249 const void *bitmap;
@@ -354,6 +311,7 @@ struct extent_node {
354 struct rb_node rb_node; /* rb node located in rb-tree */ 311 struct rb_node rb_node; /* rb node located in rb-tree */
355 struct list_head list; /* node in global extent list of sbi */ 312 struct list_head list; /* node in global extent list of sbi */
356 struct extent_info ei; /* extent info */ 313 struct extent_info ei; /* extent info */
314 struct extent_tree *et; /* extent tree pointer */
357}; 315};
358 316
359struct extent_tree { 317struct extent_tree {
@@ -382,6 +340,7 @@ struct f2fs_map_blocks {
382 block_t m_lblk; 340 block_t m_lblk;
383 unsigned int m_len; 341 unsigned int m_len;
384 unsigned int m_flags; 342 unsigned int m_flags;
343 pgoff_t *m_next_pgofs; /* points to the next possible non-hole pgofs */
385}; 344};
386 345
387/* for flag in get_data_block */ 346/* for flag in get_data_block */
@@ -389,6 +348,8 @@ struct f2fs_map_blocks {
389#define F2FS_GET_BLOCK_DIO 1 348#define F2FS_GET_BLOCK_DIO 1
390#define F2FS_GET_BLOCK_FIEMAP 2 349#define F2FS_GET_BLOCK_FIEMAP 2
391#define F2FS_GET_BLOCK_BMAP 3 350#define F2FS_GET_BLOCK_BMAP 3
351#define F2FS_GET_BLOCK_PRE_DIO 4
352#define F2FS_GET_BLOCK_PRE_AIO 5
392 353
393/* 354/*
394 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 355 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
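Reviewer note: two additions to the block-mapping interface travel together here. m_next_pgofs lets f2fs_map_blocks() hand back the next candidate non-hole offset when a lookup lands in a hole, so extent walkers such as fiemap can jump ahead instead of probing block by block (the start_blk tweak at the very top of this section appears to build on this). The PRE_DIO/PRE_AIO flags distinguish preallocation requests from plain reads. A hole-aware lookup might look like this sketch (local names; treating an empty m_flags as a hole is an assumption):

        pgoff_t next_pgofs;
        struct f2fs_map_blocks map = {
                .m_lblk = start_blk,
                .m_len = len,
                .m_next_pgofs = &next_pgofs,    /* filled when a hole is hit */
        };

        err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
        if (!err && !map.m_flags)
                start_blk = next_pgofs;         /* skip over the hole */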
@@ -410,15 +371,6 @@ struct f2fs_map_blocks {
410#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) 371#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
411#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) 372#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
412 373
413/* Encryption algorithms */
414#define F2FS_ENCRYPTION_MODE_INVALID 0
415#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1
416#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2
417#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3
418#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4
419
420#include "f2fs_crypto.h"
421
422#define DEF_DIR_LEVEL 0 374#define DEF_DIR_LEVEL 0
423 375
424struct f2fs_inode_info { 376struct f2fs_inode_info {
@@ -442,13 +394,7 @@ struct f2fs_inode_info {
442 struct list_head dirty_list; /* linked in global dirty list */ 394 struct list_head dirty_list; /* linked in global dirty list */
443 struct list_head inmem_pages; /* inmemory pages managed by f2fs */ 395 struct list_head inmem_pages; /* inmemory pages managed by f2fs */
444 struct mutex inmem_lock; /* lock for inmemory pages */ 396 struct mutex inmem_lock; /* lock for inmemory pages */
445
446 struct extent_tree *extent_tree; /* cached extent_tree entry */ 397 struct extent_tree *extent_tree; /* cached extent_tree entry */
447
448#ifdef CONFIG_F2FS_FS_ENCRYPTION
449 /* Encryption params */
450 struct f2fs_crypt_info *i_crypt_info;
451#endif
452}; 398};
453 399
454static inline void get_extent_info(struct extent_info *ext, 400static inline void get_extent_info(struct extent_info *ext,
@@ -515,6 +461,7 @@ struct f2fs_nm_info {
515 nid_t next_scan_nid; /* the next nid to be scanned */ 461 nid_t next_scan_nid; /* the next nid to be scanned */
516 unsigned int ram_thresh; /* control the memory footprint */ 462 unsigned int ram_thresh; /* control the memory footprint */
517 unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ 463 unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
464 unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
518 465
519 /* NAT cache management */ 466 /* NAT cache management */
520 struct radix_tree_root nat_root;/* root of the nat entry cache */ 467 struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -549,6 +496,8 @@ struct dnode_of_data {
549 unsigned int ofs_in_node; /* data offset in the node page */ 496 unsigned int ofs_in_node; /* data offset in the node page */
550 bool inode_page_locked; /* inode page is locked or not */ 497 bool inode_page_locked; /* inode page is locked or not */
551 bool node_changed; /* is node block changed */ 498 bool node_changed; /* is node block changed */
499 char cur_level; /* level of hole node page */
500 char max_level; /* level of current page located */
552 block_t data_blkaddr; /* block address of the node block */ 501 block_t data_blkaddr; /* block address of the node block */
553}; 502};
554 503
@@ -679,6 +628,7 @@ enum page_type {
679 META_FLUSH, 628 META_FLUSH,
680 INMEM, /* the below types are used by tracepoints only. */ 629 INMEM, /* the below types are used by tracepoints only. */
681 INMEM_DROP, 630 INMEM_DROP,
631 INMEM_REVOKE,
682 IPU, 632 IPU,
683 OPU, 633 OPU,
684}; 634};
@@ -687,7 +637,8 @@ struct f2fs_io_info {
687 struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ 637 struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
688 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ 638 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
689 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ 639 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
690 block_t blk_addr; /* block address to be written */ 640 block_t new_blkaddr; /* new block address to be written */
641 block_t old_blkaddr; /* old block address before CoW */
691 struct page *page; /* page to be written */ 642 struct page *page; /* page to be written */
692 struct page *encrypted_page; /* encrypted page */ 643 struct page *encrypted_page; /* encrypted page */
693}; 644};
@@ -844,8 +795,22 @@ struct f2fs_sb_info {
844 struct list_head s_list; 795 struct list_head s_list;
845 struct mutex umount_mutex; 796 struct mutex umount_mutex;
846 unsigned int shrinker_run_no; 797 unsigned int shrinker_run_no;
798
799 /* For write statistics */
800 u64 sectors_written_start;
801 u64 kbytes_written;
802
803 /* Reference to checksum algorithm driver via cryptoapi */
804 struct crypto_shash *s_chksum_driver;
847}; 805};
848 806
807/* For write statistics: assuming a sector size of 512 bytes,
808 * the return value is in kilobytes. s is a struct f2fs_sb_info pointer.
809 */
810#define BD_PART_WRITTEN(s) \
811(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
812 s->sectors_written_start) >> 1)
813
849static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) 814static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
850{ 815{
851 sbi->last_time[type] = jiffies; 816 sbi->last_time[type] = jiffies;
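Reviewer note: the write-statistics fields let f2fs report how much was written to the underlying device. sectors_written_start is presumably sampled from the partition counters at mount time (that side is not in this diff), and BD_PART_WRITTEN() converts the delta to kilobytes, with >> 1 encoding the 512-byte-sector assumption the comment states. Sketch of both ends:

        /* at mount, hypothetically: remember where the device counter started */
        sbi->sectors_written_start =
                (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);

        /* later: kilobytes written to the device during this mount */
        u64 written_kb = BD_PART_WRITTEN(sbi);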
@@ -874,6 +839,29 @@ static inline bool is_idle(struct f2fs_sb_info *sbi)
874/* 839/*
875 * Inline functions 840 * Inline functions
876 */ 841 */
842static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
843 unsigned int length)
844{
845 SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
846 u32 *ctx = (u32 *)shash_desc_ctx(shash);
847 int err;
848
849 shash->tfm = sbi->s_chksum_driver;
850 shash->flags = 0;
851 *ctx = F2FS_SUPER_MAGIC;
852
853 err = crypto_shash_update(shash, address, length);
854 BUG_ON(err);
855
856 return *ctx;
857}
858
859static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
860 void *buf, size_t buf_size)
861{
862 return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
863}
864
877static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) 865static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
878{ 866{
879 return container_of(inode, struct f2fs_inode_info, vfs_inode); 867 return container_of(inode, struct f2fs_inode_info, vfs_inode);
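Reviewer note: this is the replacement for the open-coded, bit-by-bit CRC32 deleted from the top of f2fs.h earlier in this diff. The checksum now goes through the kernel crypto API's shash interface, with the tfm (s_chksum_driver, added to f2fs_sb_info above) allocated once per superblock; seeding the descriptor context with F2FS_SUPER_MAGIC keeps results compatible with the old helper. Allocation sketch (the "crc32" algorithm name is an assumption, the fill_super side is not shown here):

        /* at fill_super time, hypothetically: */
        sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
        if (IS_ERR(sbi->s_chksum_driver))
                return PTR_ERR(sbi->s_chksum_driver);

        /* verification keeps its old shape: */
        if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
                return -EINVAL;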
@@ -1006,7 +994,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
1006 994
1007static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 995static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
1008{ 996{
1009 f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); 997 down_write(&sbi->cp_rwsem);
1010} 998}
1011 999
1012static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 1000static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -1525,9 +1513,9 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
1525 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); 1513 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
1526} 1514}
1527 1515
1528static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 1516static inline unsigned int addrs_per_inode(struct inode *inode)
1529{ 1517{
1530 if (f2fs_has_inline_xattr(&fi->vfs_inode)) 1518 if (f2fs_has_inline_xattr(inode))
1531 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; 1519 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
1532 return DEF_ADDRS_PER_INODE; 1520 return DEF_ADDRS_PER_INODE;
1533} 1521}
@@ -1681,10 +1669,10 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
1681 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) 1669 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
1682 1670
1683/* get offset of first page in next direct node */ 1671/* get offset of first page in next direct node */
1684#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ 1672#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
1685 ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ 1673 ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
1686 (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ 1674 (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
1687 ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) 1675 ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
1688 1676
1689/* 1677/*
1690 * file.c 1678 * file.c
@@ -1723,10 +1711,10 @@ struct dentry *f2fs_get_parent(struct dentry *child);
1723extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; 1711extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
1724void set_de_type(struct f2fs_dir_entry *, umode_t); 1712void set_de_type(struct f2fs_dir_entry *, umode_t);
1725 1713
1726struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, 1714struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
1727 f2fs_hash_t, int *, struct f2fs_dentry_ptr *); 1715 f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
1728bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, 1716bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
1729 unsigned int, struct f2fs_str *); 1717 unsigned int, struct fscrypt_str *);
1730void do_make_empty_dir(struct inode *, struct inode *, 1718void do_make_empty_dir(struct inode *, struct inode *,
1731 struct f2fs_dentry_ptr *); 1719 struct f2fs_dentry_ptr *);
1732struct page *init_inode_metadata(struct inode *, struct inode *, 1720struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -1763,6 +1751,7 @@ int f2fs_commit_super(struct f2fs_sb_info *, bool);
1763int f2fs_sync_fs(struct super_block *, int); 1751int f2fs_sync_fs(struct super_block *, int);
1764extern __printf(3, 4) 1752extern __printf(3, 4)
1765void f2fs_msg(struct super_block *, const char *, const char *, ...); 1753void f2fs_msg(struct super_block *, const char *, const char *, ...);
1754int sanity_check_ckpt(struct f2fs_sb_info *sbi);
1766 1755
1767/* 1756/*
1768 * hash.c 1757 * hash.c
@@ -1780,6 +1769,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t);
1780bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1769bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
1781bool need_inode_block_update(struct f2fs_sb_info *, nid_t); 1770bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
1782void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1771void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
1772pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t);
1783int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1773int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1784int truncate_inode_blocks(struct inode *, pgoff_t); 1774int truncate_inode_blocks(struct inode *, pgoff_t);
1785int truncate_xattr_node(struct inode *, struct page *); 1775int truncate_xattr_node(struct inode *, struct page *);
@@ -1811,7 +1801,8 @@ void destroy_node_manager_caches(void);
1811 * segment.c 1801 * segment.c
1812 */ 1802 */
1813void register_inmem_page(struct inode *, struct page *); 1803void register_inmem_page(struct inode *, struct page *);
1814int commit_inmem_pages(struct inode *, bool); 1804void drop_inmem_pages(struct inode *);
1805int commit_inmem_pages(struct inode *);
1815void f2fs_balance_fs(struct f2fs_sb_info *, bool); 1806void f2fs_balance_fs(struct f2fs_sb_info *, bool);
1816void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1807void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1817int f2fs_issue_flush(struct f2fs_sb_info *); 1808int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1832,16 +1823,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *);
1832void write_node_page(unsigned int, struct f2fs_io_info *); 1823void write_node_page(unsigned int, struct f2fs_io_info *);
1833void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); 1824void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
1834void rewrite_data_page(struct f2fs_io_info *); 1825void rewrite_data_page(struct f2fs_io_info *);
1826void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *,
1827 block_t, block_t, bool, bool);
1835void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, 1828void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
1836 block_t, block_t, unsigned char, bool); 1829 block_t, block_t, unsigned char, bool, bool);
1837void allocate_data_block(struct f2fs_sb_info *, struct page *, 1830void allocate_data_block(struct f2fs_sb_info *, struct page *,
1838 block_t, block_t *, struct f2fs_summary *, int); 1831 block_t, block_t *, struct f2fs_summary *, int);
1839void f2fs_wait_on_page_writeback(struct page *, enum page_type); 1832void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1840void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); 1833void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
1841void write_data_summaries(struct f2fs_sb_info *, block_t); 1834void write_data_summaries(struct f2fs_sb_info *, block_t);
1842void write_node_summaries(struct f2fs_sb_info *, block_t); 1835void write_node_summaries(struct f2fs_sb_info *, block_t);
1843int lookup_journal_in_cursum(struct f2fs_summary_block *, 1836int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int);
1844 int, unsigned int, int);
1845void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); 1837void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
1846int build_segment_manager(struct f2fs_sb_info *); 1838int build_segment_manager(struct f2fs_sb_info *);
1847void destroy_segment_manager(struct f2fs_sb_info *); 1839void destroy_segment_manager(struct f2fs_sb_info *);
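Reviewer note: f2fs_wait_on_page_writeback() gains a third parameter across this whole diff. Every metadata-update site (dentry blocks, inode pages) passes true, while the write_begin path passes false; the flag evidently selects whether the caller must block until in-flight writeback has fully completed, with the exact semantics living in segment.c, outside this diff. New prototype for reference (the parameter name is a guess):

        void f2fs_wait_on_page_writeback(struct page *, enum page_type,
                                        bool ordered /* name assumed */);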
@@ -1881,11 +1873,16 @@ void destroy_checkpoint_caches(void);
1881 * data.c 1873 * data.c
1882 */ 1874 */
1883void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); 1875void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
1876void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
1877 struct page *, nid_t, enum page_type, int);
1878void f2fs_flush_merged_bios(struct f2fs_sb_info *);
1884int f2fs_submit_page_bio(struct f2fs_io_info *); 1879int f2fs_submit_page_bio(struct f2fs_io_info *);
1885void f2fs_submit_page_mbio(struct f2fs_io_info *); 1880void f2fs_submit_page_mbio(struct f2fs_io_info *);
1886void set_data_blkaddr(struct dnode_of_data *); 1881void set_data_blkaddr(struct dnode_of_data *);
1882void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
1887int reserve_new_block(struct dnode_of_data *); 1883int reserve_new_block(struct dnode_of_data *);
1888int f2fs_get_block(struct dnode_of_data *, pgoff_t); 1884int f2fs_get_block(struct dnode_of_data *, pgoff_t);
1885ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
1889int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); 1886int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
1890struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); 1887struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
1891struct page *find_data_page(struct inode *, pgoff_t); 1888struct page *find_data_page(struct inode *, pgoff_t);
@@ -1902,7 +1899,7 @@ int f2fs_release_page(struct page *, gfp_t);
1902 */ 1899 */
1903int start_gc_thread(struct f2fs_sb_info *); 1900int start_gc_thread(struct f2fs_sb_info *);
1904void stop_gc_thread(struct f2fs_sb_info *); 1901void stop_gc_thread(struct f2fs_sb_info *);
1905block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); 1902block_t start_bidx_of_node(unsigned int, struct inode *);
1906int f2fs_gc(struct f2fs_sb_info *, bool); 1903int f2fs_gc(struct f2fs_sb_info *, bool);
1907void build_gc_manager(struct f2fs_sb_info *); 1904void build_gc_manager(struct f2fs_sb_info *);
1908 1905
@@ -2093,7 +2090,7 @@ int f2fs_convert_inline_inode(struct inode *);
2093int f2fs_write_inline_data(struct inode *, struct page *); 2090int f2fs_write_inline_data(struct inode *, struct page *);
2094bool recover_inline_data(struct inode *, struct page *); 2091bool recover_inline_data(struct inode *, struct page *);
2095struct f2fs_dir_entry *find_in_inline_dir(struct inode *, 2092struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
2096 struct f2fs_filename *, struct page **); 2093 struct fscrypt_name *, struct page **);
2097struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); 2094struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
2098int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); 2095int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
2099int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, 2096int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
@@ -2102,7 +2099,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
2102 struct inode *, struct inode *); 2099 struct inode *, struct inode *);
2103bool f2fs_empty_inline_dir(struct inode *); 2100bool f2fs_empty_inline_dir(struct inode *);
2104int f2fs_read_inline_dir(struct file *, struct dir_context *, 2101int f2fs_read_inline_dir(struct file *, struct dir_context *,
2105 struct f2fs_str *); 2102 struct fscrypt_str *);
2106int f2fs_inline_data_fiemap(struct inode *, 2103int f2fs_inline_data_fiemap(struct inode *,
2107 struct fiemap_extent_info *, __u64, __u64); 2104 struct fiemap_extent_info *, __u64, __u64);
2108 2105
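find_in_inline_dir() and f2fs_read_inline_dir() now take the shared fscrypt types in place of the f2fs-private f2fs_filename/f2fs_str. For orientation, the shared structures introduced by this series look roughly like this (shape shown only to help read the prototypes):

struct fscrypt_str {
	unsigned char *name;	/* on-disk (possibly encrypted) bytes */
	u32 len;
};

struct fscrypt_name {
	const struct qstr *usr_fname;	/* name as the user passed it */
	struct fscrypt_str disk_name;	/* name as stored in the directory */
	u32 hash;
	u32 minor_hash;
	struct fscrypt_str crypto_buf;	/* scratch buffer for en/decryption */
};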
@@ -2132,13 +2129,9 @@ void destroy_extent_cache(void);
2132/* 2129/*
2133 * crypto support 2130 * crypto support
2134 */ 2131 */
2135static inline int f2fs_encrypted_inode(struct inode *inode) 2132static inline bool f2fs_encrypted_inode(struct inode *inode)
2136{ 2133{
2137#ifdef CONFIG_F2FS_FS_ENCRYPTION
2138 return file_is_encrypt(inode); 2134 return file_is_encrypt(inode);
2139#else
2140 return 0;
2141#endif
2142} 2135}
2143 2136
2144static inline void f2fs_set_encrypted_inode(struct inode *inode) 2137static inline void f2fs_set_encrypted_inode(struct inode *inode)
@@ -2150,20 +2143,12 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
2150 2143
2151static inline bool f2fs_bio_encrypted(struct bio *bio) 2144static inline bool f2fs_bio_encrypted(struct bio *bio)
2152{ 2145{
2153#ifdef CONFIG_F2FS_FS_ENCRYPTION 2146 return bio->bi_private != NULL;
2154 return unlikely(bio->bi_private != NULL);
2155#else
2156 return false;
2157#endif
2158} 2147}
2159 2148
2160static inline int f2fs_sb_has_crypto(struct super_block *sb) 2149static inline int f2fs_sb_has_crypto(struct super_block *sb)
2161{ 2150{
2162#ifdef CONFIG_F2FS_FS_ENCRYPTION
2163 return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); 2151 return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
2164#else
2165 return 0;
2166#endif
2167} 2152}
2168 2153
2169static inline bool f2fs_may_encrypt(struct inode *inode) 2154static inline bool f2fs_may_encrypt(struct inode *inode)
@@ -2177,86 +2162,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
2177#endif 2162#endif
2178} 2163}
2179 2164
2180/* crypto_policy.c */ 2165#ifndef CONFIG_F2FS_FS_ENCRYPTION
2181int f2fs_is_child_context_consistent_with_parent(struct inode *, 2166#define fscrypt_set_d_op(i)
2182 struct inode *); 2167#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
2183int f2fs_inherit_context(struct inode *, struct inode *, struct page *); 2168#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
2184int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); 2169#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
2185int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); 2170#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
2186 2171#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
2187/* crypt.c */ 2172#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
2188extern struct kmem_cache *f2fs_crypt_info_cachep; 2173#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
2189bool f2fs_valid_contents_enc_mode(uint32_t); 2174#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
2190uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); 2175#define fscrypt_process_policy fscrypt_notsupp_process_policy
2191struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); 2176#define fscrypt_get_policy fscrypt_notsupp_get_policy
2192void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); 2177#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
2193struct page *f2fs_encrypt(struct inode *, struct page *); 2178#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
2194int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); 2179#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
2195int f2fs_decrypt_one(struct inode *, struct page *); 2180#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
2196void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); 2181#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
2197 2182#define fscrypt_free_filename fscrypt_notsupp_free_filename
2198/* crypto_key.c */ 2183#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
2199void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); 2184#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
2200int _f2fs_get_encryption_info(struct inode *inode); 2185#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
2201 2186#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
2202/* crypto_fname.c */ 2187#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
2203bool f2fs_valid_filenames_enc_mode(uint32_t);
2204u32 f2fs_fname_crypto_round_up(u32, u32);
2205int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
2206int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
2207 const struct f2fs_str *, struct f2fs_str *);
2208int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
2209 struct f2fs_str *);
2210
2211#ifdef CONFIG_F2FS_FS_ENCRYPTION
2212void f2fs_restore_and_release_control_page(struct page **);
2213void f2fs_restore_control_page(struct page *);
2214
2215int __init f2fs_init_crypto(void);
2216int f2fs_crypto_initialize(void);
2217void f2fs_exit_crypto(void);
2218
2219int f2fs_has_encryption_key(struct inode *);
2220
2221static inline int f2fs_get_encryption_info(struct inode *inode)
2222{
2223 struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
2224
2225 if (!ci ||
2226 (ci->ci_keyring_key &&
2227 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
2228 (1 << KEY_FLAG_REVOKED) |
2229 (1 << KEY_FLAG_DEAD)))))
2230 return _f2fs_get_encryption_info(inode);
2231 return 0;
2232}
2233
2234void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
2235int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
2236 int lookup, struct f2fs_filename *);
2237void f2fs_fname_free_filename(struct f2fs_filename *);
2238#else
2239static inline void f2fs_restore_and_release_control_page(struct page **p) { }
2240static inline void f2fs_restore_control_page(struct page *p) { }
2241
2242static inline int __init f2fs_init_crypto(void) { return 0; }
2243static inline void f2fs_exit_crypto(void) { }
2244
2245static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
2246static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
2247static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
2248
2249static inline int f2fs_fname_setup_filename(struct inode *dir,
2250 const struct qstr *iname,
2251 int lookup, struct f2fs_filename *fname)
2252{
2253 memset(fname, 0, sizeof(struct f2fs_filename));
2254 fname->usr_fname = iname;
2255 fname->disk_name.name = (unsigned char *)iname->name;
2256 fname->disk_name.len = iname->len;
2257 return 0;
2258}
2259
2260static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
2261#endif 2188#endif
2262#endif 2189#endif
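The macro table above is the heart of the conversion on the !CONFIG_F2FS_FS_ENCRYPTION side: instead of one #ifdef stub per helper, every fscrypt_* name is aliased to a *_notsupp stub that fails cleanly, and call sites stay #ifdef-free. A self-contained illustration of the pattern (all names hypothetical):

#include <errno.h>
#include <stdio.h>

struct inode;

/* The compiled-out variant: same signature, clean failure. */
static inline int feature_do_work_notsupp(struct inode *inode)
{
	(void)inode;
	return -EOPNOTSUPP;
}

#ifndef CONFIG_FEATURE
#define feature_do_work feature_do_work_notsupp
#endif

int main(void)
{
	/* callers compile unchanged whether the feature is built or not */
	printf("feature_do_work() -> %d\n", feature_do_work(NULL));
	return 0;
}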
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
deleted file mode 100644
index c2c1c2b63b25..000000000000
--- a/fs/f2fs/f2fs_crypto.h
+++ /dev/null
@@ -1,151 +0,0 @@
1/*
2 * linux/fs/f2fs/f2fs_crypto.h
3 *
4 * Copied from linux/fs/ext4/ext4_crypto.h
5 *
6 * Copyright (C) 2015, Google, Inc.
7 *
8 * This contains encryption header content for f2fs
9 *
10 * Written by Michael Halcrow, 2015.
11 * Modified by Jaegeuk Kim, 2015.
12 */
13#ifndef _F2FS_CRYPTO_H
14#define _F2FS_CRYPTO_H
15
16#include <linux/fs.h>
17
18#define F2FS_KEY_DESCRIPTOR_SIZE 8
19
20/* Policy provided via an ioctl on the topmost directory */
21struct f2fs_encryption_policy {
22 char version;
23 char contents_encryption_mode;
24 char filenames_encryption_mode;
25 char flags;
26 char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
27} __attribute__((__packed__));
28
29#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
30#define F2FS_KEY_DERIVATION_NONCE_SIZE 16
31
32#define F2FS_POLICY_FLAGS_PAD_4 0x00
33#define F2FS_POLICY_FLAGS_PAD_8 0x01
34#define F2FS_POLICY_FLAGS_PAD_16 0x02
35#define F2FS_POLICY_FLAGS_PAD_32 0x03
36#define F2FS_POLICY_FLAGS_PAD_MASK 0x03
37#define F2FS_POLICY_FLAGS_VALID 0x03
38
39/**
40 * Encryption context for inode
41 *
42 * Protector format:
43 * 1 byte: Protector format (1 = this version)
44 * 1 byte: File contents encryption mode
45 * 1 byte: File names encryption mode
46 * 1 byte: Flags
47 * 8 bytes: Master Key descriptor
48 * 16 bytes: Encryption Key derivation nonce
49 */
50struct f2fs_encryption_context {
51 char format;
52 char contents_encryption_mode;
53 char filenames_encryption_mode;
54 char flags;
55 char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
56 char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
57} __attribute__((__packed__));
58
59/* Encryption parameters */
60#define F2FS_XTS_TWEAK_SIZE 16
61#define F2FS_AES_128_ECB_KEY_SIZE 16
62#define F2FS_AES_256_GCM_KEY_SIZE 32
63#define F2FS_AES_256_CBC_KEY_SIZE 32
64#define F2FS_AES_256_CTS_KEY_SIZE 32
65#define F2FS_AES_256_XTS_KEY_SIZE 64
66#define F2FS_MAX_KEY_SIZE 64
67
68#define F2FS_KEY_DESC_PREFIX "f2fs:"
69#define F2FS_KEY_DESC_PREFIX_SIZE 5
70
71struct f2fs_encryption_key {
72 __u32 mode;
73 char raw[F2FS_MAX_KEY_SIZE];
74 __u32 size;
75} __attribute__((__packed__));
76
77struct f2fs_crypt_info {
78 char ci_data_mode;
79 char ci_filename_mode;
80 char ci_flags;
81 struct crypto_ablkcipher *ci_ctfm;
82 struct key *ci_keyring_key;
83 char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
84};
85
86#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
87#define F2FS_WRITE_PATH_FL 0x00000002
88
89struct f2fs_crypto_ctx {
90 union {
91 struct {
92 struct page *bounce_page; /* Ciphertext page */
93 struct page *control_page; /* Original page */
94 } w;
95 struct {
96 struct bio *bio;
97 struct work_struct work;
98 } r;
99 struct list_head free_list; /* Free list */
100 };
101 char flags; /* Flags */
102};
103
104struct f2fs_completion_result {
105 struct completion completion;
106 int res;
107};
108
109#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
110 struct f2fs_completion_result ecr = { \
111 COMPLETION_INITIALIZER((ecr).completion), 0 }
112
113static inline int f2fs_encryption_key_size(int mode)
114{
115 switch (mode) {
116 case F2FS_ENCRYPTION_MODE_AES_256_XTS:
117 return F2FS_AES_256_XTS_KEY_SIZE;
118 case F2FS_ENCRYPTION_MODE_AES_256_GCM:
119 return F2FS_AES_256_GCM_KEY_SIZE;
120 case F2FS_ENCRYPTION_MODE_AES_256_CBC:
121 return F2FS_AES_256_CBC_KEY_SIZE;
122 case F2FS_ENCRYPTION_MODE_AES_256_CTS:
123 return F2FS_AES_256_CTS_KEY_SIZE;
124 default:
125 BUG();
126 }
127 return 0;
128}
129
130#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4
131#define F2FS_CRYPTO_BLOCK_SIZE 16
132#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32
133
134/**
135 * For encrypted symlinks, the ciphertext length is stored at the beginning
136 * of the string in little-endian format.
137 */
138struct f2fs_encrypted_symlink_data {
139 __le16 len;
140 char encrypted_path[1];
141} __attribute__((__packed__));
142
143/**
144 * This function is used to calculate the disk space required to
145 * store a filename of length l in encrypted symlink format.
146 */
147static inline u32 encrypted_symlink_data_len(u32 l)
148{
149 return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
150}
151#endif /* _F2FS_CRYPTO_H */
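The length helper at the end of the deleted header is worth a second look, since the same arithmetic lives on in the shared fscrypt code: the -1 cancels the placeholder byte already counted inside the packed struct. A standalone check of the math:

#include <stdint.h>
#include <stdio.h>

struct encrypted_symlink_data {
	uint16_t len;			/* __le16 ciphertext length on disk */
	char encrypted_path[1];		/* ciphertext bytes follow */
} __attribute__((__packed__));

static uint32_t encrypted_symlink_data_len(uint32_t l)
{
	/* 2-byte header + l payload bytes; sizeof already counts one
	 * payload byte via encrypted_path[1], hence the -1 */
	return l + sizeof(struct encrypted_symlink_data) - 1;
}

int main(void)
{
	printf("%u\n", encrypted_symlink_data_len(32));	/* 32 + 3 - 1 = 34 */
	return 0;
}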
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ea272be62677..b41c3579ea9e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -86,7 +86,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
86 trace_f2fs_vm_page_mkwrite(page, DATA); 86 trace_f2fs_vm_page_mkwrite(page, DATA);
87mapped: 87mapped:
88 /* fill the page */ 88 /* fill the page */
89 f2fs_wait_on_page_writeback(page, DATA); 89 f2fs_wait_on_page_writeback(page, DATA, false);
90 90
91 /* wait for GCed encrypted page writeback */ 91 /* wait for GCed encrypted page writeback */
92 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 92 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -301,7 +301,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
301 pagevec_init(&pvec, 0); 301 pagevec_init(&pvec, 0);
302 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, 302 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
303 PAGECACHE_TAG_DIRTY, 1); 303 PAGECACHE_TAG_DIRTY, 1);
304 pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; 304 pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
305 pagevec_release(&pvec); 305 pagevec_release(&pvec);
306 return pgofs; 306 return pgofs;
307} 307}
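The LONG_MAX -> ULONG_MAX change above fixes the "no dirty page found" sentinel: pgoff_t is an unsigned long, so LONG_MAX marks only the halfway point of its range and a legitimate offset above it would be mistaken for dirty data. A quick standalone demonstration:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* pgoff_t is an unsigned long: LONG_MAX is only the halfway mark,
	 * so it is a poor "nothing found" sentinel. */
	printf("LONG_MAX  = %lu\n", (unsigned long)LONG_MAX);
	printf("ULONG_MAX = %lu\n", ULONG_MAX);
	printf("valid offsets above the old sentinel: %lu\n",
	       ULONG_MAX - (unsigned long)LONG_MAX);
	return 0;
}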
@@ -358,15 +358,14 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
358 } else if (err == -ENOENT) { 358 } else if (err == -ENOENT) {
359 /* direct node does not exist */ 359 /* direct node does not exist */
360 if (whence == SEEK_DATA) { 360 if (whence == SEEK_DATA) {
361 pgofs = PGOFS_OF_NEXT_DNODE(pgofs, 361 pgofs = get_next_page_offset(&dn, pgofs);
362 F2FS_I(inode));
363 continue; 362 continue;
364 } else { 363 } else {
365 goto found; 364 goto found;
366 } 365 }
367 } 366 }
368 367
369 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 368 end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
370 369
371 /* find data/hole in dnode block */ 370 /* find data/hole in dnode block */
372 for (; dn.ofs_in_node < end_offset; 371 for (; dn.ofs_in_node < end_offset;
@@ -422,9 +421,11 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
422 int err; 421 int err;
423 422
424 if (f2fs_encrypted_inode(inode)) { 423 if (f2fs_encrypted_inode(inode)) {
425 err = f2fs_get_encryption_info(inode); 424 err = fscrypt_get_encryption_info(inode);
426 if (err) 425 if (err)
427 return 0; 426 return 0;
427 if (!f2fs_encrypted_inode(inode))
428 return -ENOKEY;
428 } 429 }
429 430
430 /* we don't need to use inline_data strictly */ 431 /* we don't need to use inline_data strictly */
@@ -440,12 +441,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
440static int f2fs_file_open(struct inode *inode, struct file *filp) 441static int f2fs_file_open(struct inode *inode, struct file *filp)
441{ 442{
442 int ret = generic_file_open(inode, filp); 443 int ret = generic_file_open(inode, filp);
444 struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
443 445
444 if (!ret && f2fs_encrypted_inode(inode)) { 446 if (!ret && f2fs_encrypted_inode(inode)) {
445 ret = f2fs_get_encryption_info(inode); 447 ret = fscrypt_get_encryption_info(inode);
446 if (ret) 448 if (ret)
447 ret = -EACCES; 449 return -EACCES;
450 if (!fscrypt_has_encryption_key(inode))
451 return -ENOKEY;
448 } 452 }
453 if (f2fs_encrypted_inode(dir) &&
454 !fscrypt_has_permitted_context(dir, inode))
455 return -EPERM;
449 return ret; 456 return ret;
450} 457}
451 458
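f2fs_file_open() now enforces three things before an encrypted file may be opened: the key metadata must load (-EACCES on failure), a usable key must be present (-ENOKEY), and the file's encryption context must be consistent with its parent directory (-EPERM). The decision ladder, reduced to a self-contained sketch with stub predicates:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int  load_key_info(void)     { return 0; }	/* fscrypt_get_encryption_info() stand-in */
static bool key_present(void)       { return false; }	/* fscrypt_has_encryption_key() stand-in */
static bool context_permitted(void) { return true; }	/* fscrypt_has_permitted_context() stand-in */

static int open_check(bool encrypted_file, bool encrypted_dir)
{
	if (encrypted_file) {
		if (load_key_info())
			return -EACCES;	/* key metadata unreadable */
		if (!key_present())
			return -ENOKEY;	/* no usable key in the keyring */
	}
	if (encrypted_dir && !context_permitted())
		return -EPERM;		/* parent/child policy mismatch */
	return 0;
}

int main(void)
{
	printf("open_check() = %d (-ENOKEY is %d)\n", open_check(true, true), -ENOKEY);
	return 0;
}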
@@ -480,7 +487,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
480 * we will invalidate all blkaddr in the whole range. 487 * we will invalidate all blkaddr in the whole range.
481 */ 488 */
482 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), 489 fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
483 F2FS_I(dn->inode)) + ofs; 490 dn->inode) + ofs;
484 f2fs_update_extent_cache_range(dn, fofs, 0, len); 491 f2fs_update_extent_cache_range(dn, fofs, 0, len);
485 dec_valid_block_count(sbi, dn->inode, nr_free); 492 dec_valid_block_count(sbi, dn->inode, nr_free);
486 sync_inode_page(dn); 493 sync_inode_page(dn);
@@ -521,9 +528,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
521 if (IS_ERR(page)) 528 if (IS_ERR(page))
522 return 0; 529 return 0;
523truncate_out: 530truncate_out:
524 f2fs_wait_on_page_writeback(page, DATA); 531 f2fs_wait_on_page_writeback(page, DATA, true);
525 zero_user(page, offset, PAGE_CACHE_SIZE - offset); 532 zero_user(page, offset, PAGE_CACHE_SIZE - offset);
526 if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) 533 if (!cache_only || !f2fs_encrypted_inode(inode) ||
534 !S_ISREG(inode->i_mode))
527 set_page_dirty(page); 535 set_page_dirty(page);
528 f2fs_put_page(page, 1); 536 f2fs_put_page(page, 1);
529 return 0; 537 return 0;
@@ -568,7 +576,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
568 goto out; 576 goto out;
569 } 577 }
570 578
571 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 579 count = ADDRS_PER_PAGE(dn.node_page, inode);
572 580
573 count -= dn.ofs_in_node; 581 count -= dn.ofs_in_node;
574 f2fs_bug_on(sbi, count < 0); 582 f2fs_bug_on(sbi, count < 0);
@@ -671,7 +679,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
671 679
672 if (attr->ia_valid & ATTR_SIZE) { 680 if (attr->ia_valid & ATTR_SIZE) {
673 if (f2fs_encrypted_inode(inode) && 681 if (f2fs_encrypted_inode(inode) &&
674 f2fs_get_encryption_info(inode)) 682 fscrypt_get_encryption_info(inode))
675 return -EACCES; 683 return -EACCES;
676 684
677 if (attr->ia_size <= i_size_read(inode)) { 685 if (attr->ia_size <= i_size_read(inode)) {
@@ -743,7 +751,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
743 if (IS_ERR(page)) 751 if (IS_ERR(page))
744 return PTR_ERR(page); 752 return PTR_ERR(page);
745 753
746 f2fs_wait_on_page_writeback(page, DATA); 754 f2fs_wait_on_page_writeback(page, DATA, true);
747 zero_user(page, start, len); 755 zero_user(page, start, len);
748 set_page_dirty(page); 756 set_page_dirty(page);
749 f2fs_put_page(page, 1); 757 f2fs_put_page(page, 1);
@@ -768,7 +776,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
768 return err; 776 return err;
769 } 777 }
770 778
771 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 779 end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
772 count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); 780 count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
773 781
774 f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); 782 f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -854,10 +862,8 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
854 } else { 862 } else {
855 new_addr = dn.data_blkaddr; 863 new_addr = dn.data_blkaddr;
856 if (!is_checkpointed_data(sbi, new_addr)) { 864 if (!is_checkpointed_data(sbi, new_addr)) {
857 dn.data_blkaddr = NULL_ADDR;
858 /* do not invalidate this block address */ 865 /* do not invalidate this block address */
859 set_data_blkaddr(&dn); 866 f2fs_update_data_blkaddr(&dn, NULL_ADDR);
860 f2fs_update_extent_cache(&dn);
861 do_replace = true; 867 do_replace = true;
862 } 868 }
863 f2fs_put_dnode(&dn); 869 f2fs_put_dnode(&dn);
@@ -884,7 +890,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
884 890
885 get_node_info(sbi, dn.nid, &ni); 891 get_node_info(sbi, dn.nid, &ni);
886 f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, 892 f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
887 ni.version, true); 893 ni.version, true, false);
888 f2fs_put_dnode(&dn); 894 f2fs_put_dnode(&dn);
889 } else { 895 } else {
890 struct page *psrc, *pdst; 896 struct page *psrc, *pdst;
@@ -892,7 +898,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
892 psrc = get_lock_data_page(inode, src, true); 898 psrc = get_lock_data_page(inode, src, true);
893 if (IS_ERR(psrc)) 899 if (IS_ERR(psrc))
894 return PTR_ERR(psrc); 900 return PTR_ERR(psrc);
895 pdst = get_new_data_page(inode, NULL, dst, false); 901 pdst = get_new_data_page(inode, NULL, dst, true);
896 if (IS_ERR(pdst)) { 902 if (IS_ERR(pdst)) {
897 f2fs_put_page(psrc, 1); 903 f2fs_put_page(psrc, 1);
898 return PTR_ERR(pdst); 904 return PTR_ERR(pdst);
@@ -908,9 +914,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
908 914
909err_out: 915err_out:
910 if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { 916 if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
911 dn.data_blkaddr = new_addr; 917 f2fs_update_data_blkaddr(&dn, new_addr);
912 set_data_blkaddr(&dn);
913 f2fs_update_extent_cache(&dn);
914 f2fs_put_dnode(&dn); 918 f2fs_put_dnode(&dn);
915 } 919 }
916 return ret; 920 return ret;
@@ -1050,12 +1054,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
1050 1054
1051 if (dn.data_blkaddr != NEW_ADDR) { 1055 if (dn.data_blkaddr != NEW_ADDR) {
1052 invalidate_blocks(sbi, dn.data_blkaddr); 1056 invalidate_blocks(sbi, dn.data_blkaddr);
1053 1057 f2fs_update_data_blkaddr(&dn, NEW_ADDR);
1054 dn.data_blkaddr = NEW_ADDR;
1055 set_data_blkaddr(&dn);
1056
1057 dn.data_blkaddr = NULL_ADDR;
1058 f2fs_update_extent_cache(&dn);
1059 } 1058 }
1060 f2fs_put_dnode(&dn); 1059 f2fs_put_dnode(&dn);
1061 f2fs_unlock_op(sbi); 1060 f2fs_unlock_op(sbi);
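This hunk, the exchange-helper hunks above, and the GC path later in the diff all collapse the repeated assign + set_data_blkaddr() + f2fs_update_extent_cache() triple into the new f2fs_update_data_blkaddr(). Its plausible body, inferred from the call sites it replaces (a sketch, not a verbatim copy):

void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
	dn->data_blkaddr = blkaddr;
	set_data_blkaddr(dn);		/* write the pointer into the dnode page */
	f2fs_update_extent_cache(dn);	/* keep the extent cache coherent */
}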
@@ -1253,7 +1252,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
1253{ 1252{
1254 /* some remaining atomic pages should be discarded */ 1253 /* some remaining atomic pages should be discarded */
1255 if (f2fs_is_atomic_file(inode)) 1254 if (f2fs_is_atomic_file(inode))
1256 commit_inmem_pages(inode, true); 1255 drop_inmem_pages(inode);
1257 if (f2fs_is_volatile_file(inode)) { 1256 if (f2fs_is_volatile_file(inode)) {
1258 set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); 1257 set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
1259 filemap_fdatawrite(inode->i_mapping); 1258 filemap_fdatawrite(inode->i_mapping);
@@ -1377,7 +1376,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
1377 1376
1378 if (f2fs_is_atomic_file(inode)) { 1377 if (f2fs_is_atomic_file(inode)) {
1379 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 1378 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1380 ret = commit_inmem_pages(inode, false); 1379 ret = commit_inmem_pages(inode);
1381 if (ret) { 1380 if (ret) {
1382 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 1381 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1383 goto err_out; 1382 goto err_out;
@@ -1440,7 +1439,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
1440 1439
1441 if (f2fs_is_atomic_file(inode)) { 1440 if (f2fs_is_atomic_file(inode)) {
1442 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 1441 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1443 commit_inmem_pages(inode, true); 1442 drop_inmem_pages(inode);
1444 } 1443 }
1445 if (f2fs_is_volatile_file(inode)) { 1444 if (f2fs_is_volatile_file(inode)) {
1446 clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); 1445 clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
@@ -1535,39 +1534,30 @@ static bool uuid_is_nonzero(__u8 u[16])
1535 1534
1536static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) 1535static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
1537{ 1536{
1538#ifdef CONFIG_F2FS_FS_ENCRYPTION 1537 struct fscrypt_policy policy;
1539 struct f2fs_encryption_policy policy;
1540 struct inode *inode = file_inode(filp); 1538 struct inode *inode = file_inode(filp);
1541 1539
1542 if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, 1540 if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
1543 sizeof(policy))) 1541 sizeof(policy)))
1544 return -EFAULT; 1542 return -EFAULT;
1545 1543
1546 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); 1544 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
1547 return f2fs_process_policy(&policy, inode); 1545 return fscrypt_process_policy(inode, &policy);
1548#else
1549 return -EOPNOTSUPP;
1550#endif
1551} 1546}
1552 1547
1553static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) 1548static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
1554{ 1549{
1555#ifdef CONFIG_F2FS_FS_ENCRYPTION 1550 struct fscrypt_policy policy;
1556 struct f2fs_encryption_policy policy;
1557 struct inode *inode = file_inode(filp); 1551 struct inode *inode = file_inode(filp);
1558 int err; 1552 int err;
1559 1553
1560 err = f2fs_get_policy(inode, &policy); 1554 err = fscrypt_get_policy(inode, &policy);
1561 if (err) 1555 if (err)
1562 return err; 1556 return err;
1563 1557
1564 if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, 1558 if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
1565 sizeof(policy)))
1566 return -EFAULT; 1559 return -EFAULT;
1567 return 0; 1560 return 0;
1568#else
1569 return -EOPNOTSUPP;
1570#endif
1571} 1561}
1572 1562
1573static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) 1563static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
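Both ioctls now traffic in the shared struct fscrypt_policy, which carries the same five fields as the deleted f2fs_encryption_policy (version, two modes, flags, an 8-byte master-key descriptor). A minimal userspace caller, assuming your installed linux/fs.h exports FS_IOC_GET_ENCRYPTION_POLICY and the struct (older headers may not):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fscrypt_policy p;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <dir>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GET_ENCRYPTION_POLICY, &p) == 0)
		printf("contents mode %u, filenames mode %u, flags %#x\n",
		       (unsigned)p.contents_encryption_mode,
		       (unsigned)p.filenames_encryption_mode,
		       (unsigned)p.flags);
	else
		perror("FS_IOC_GET_ENCRYPTION_POLICY");
	close(fd);
	return 0;
}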
@@ -1648,7 +1638,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
1648 struct f2fs_defragment *range) 1638 struct f2fs_defragment *range)
1649{ 1639{
1650 struct inode *inode = file_inode(filp); 1640 struct inode *inode = file_inode(filp);
1651 struct f2fs_map_blocks map; 1641 struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
1652 struct extent_info ei; 1642 struct extent_info ei;
1653 pgoff_t pg_start, pg_end; 1643 pgoff_t pg_start, pg_end;
1654 unsigned int blk_per_seg = sbi->blocks_per_seg; 1644 unsigned int blk_per_seg = sbi->blocks_per_seg;
@@ -1874,14 +1864,32 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1874 1864
1875static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1865static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1876{ 1866{
1877 struct inode *inode = file_inode(iocb->ki_filp); 1867 struct file *file = iocb->ki_filp;
1868 struct inode *inode = file_inode(file);
1869 ssize_t ret;
1878 1870
1879 if (f2fs_encrypted_inode(inode) && 1871 if (f2fs_encrypted_inode(inode) &&
1880 !f2fs_has_encryption_key(inode) && 1872 !fscrypt_has_encryption_key(inode) &&
1881 f2fs_get_encryption_info(inode)) 1873 fscrypt_get_encryption_info(inode))
1882 return -EACCES; 1874 return -EACCES;
1883 1875
1884 return generic_file_write_iter(iocb, from); 1876 inode_lock(inode);
1877 ret = generic_write_checks(iocb, from);
1878 if (ret > 0) {
1879 ret = f2fs_preallocate_blocks(iocb, from);
1880 if (!ret)
1881 ret = __generic_file_write_iter(iocb, from);
1882 }
1883 inode_unlock(inode);
1884
1885 if (ret > 0) {
1886 ssize_t err;
1887
1888 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
1889 if (err < 0)
1890 ret = err;
1891 }
1892 return ret;
1885} 1893}
1886 1894
1887#ifdef CONFIG_COMPAT 1895#ifdef CONFIG_COMPAT
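f2fs_file_write_iter() stops delegating to generic_file_write_iter() so it can reserve blocks for the whole write up front, under the inode lock, before the page-by-page copy; the O_SYNC-style flush stays outside the lock. The ordering, as a self-contained skeleton with stubs for the generic helpers:

#include <stdio.h>

static long write_checks(void)  { return 4096; }
static int  preallocate(void)   { puts("reserve blocks for the whole range"); return 0; }
static long copy_pages(void)    { puts("copy data page by page"); return 4096; }
static int  sync_range(long n)  { printf("sync %ld bytes\n", n); return 0; }

int main(void)
{
	long ret;

	puts("inode_lock");
	ret = write_checks();			/* generic_write_checks() stand-in */
	if (ret > 0) {
		if (preallocate())		/* f2fs_preallocate_blocks() stand-in */
			ret = -1;
		else
			ret = copy_pages();	/* __generic_file_write_iter() stand-in */
	}
	puts("inode_unlock");

	if (ret > 0 && sync_range(ret) < 0)	/* generic_write_sync() stand-in */
		ret = -1;
	printf("ret = %ld\n", ret);
	return 0;
}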
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index f610c2a9bdde..b0051a97824c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -245,6 +245,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
245 return get_cb_cost(sbi, segno); 245 return get_cb_cost(sbi, segno);
246} 246}
247 247
248static unsigned int count_bits(const unsigned long *addr,
249 unsigned int offset, unsigned int len)
250{
251 unsigned int end = offset + len, sum = 0;
252
253 while (offset < end) {
254 if (test_bit(offset++, addr))
255 ++sum;
256 }
257 return sum;
258}
259
248/* 260/*
249 * This function is called from two paths. 261 * This function is called from two paths.
250 * One is garbage collection and the other is SSR segment selection. 262 * One is garbage collection and the other is SSR segment selection.
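count_bits() added above is a plain population count over a bit range; the kernel version uses test_bit() so the range need not be word-aligned. A userspace equivalent with a worked check:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned int count_bits(const unsigned long *addr,
			       unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (addr[offset / BITS_PER_LONG] &
		    (1UL << (offset % BITS_PER_LONG)))
			++sum;
		++offset;
	}
	return sum;
}

int main(void)
{
	unsigned long map[1] = { 0xB4UL };	/* bits 2, 4, 5, 7 set */

	printf("%u\n", count_bits(map, 2, 4));	/* bits 2..5 -> expect 3 */
	return 0;
}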
@@ -258,9 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
258{ 270{
259 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 271 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
260 struct victim_sel_policy p; 272 struct victim_sel_policy p;
261 unsigned int secno, max_cost; 273 unsigned int secno, max_cost, last_victim;
262 unsigned int last_segment = MAIN_SEGS(sbi); 274 unsigned int last_segment = MAIN_SEGS(sbi);
263 int nsearched = 0; 275 unsigned int nsearched = 0;
264 276
265 mutex_lock(&dirty_i->seglist_lock); 277 mutex_lock(&dirty_i->seglist_lock);
266 278
@@ -273,6 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
273 if (p.max_search == 0) 285 if (p.max_search == 0)
274 goto out; 286 goto out;
275 287
288 last_victim = sbi->last_victim[p.gc_mode];
276 if (p.alloc_mode == LFS && gc_type == FG_GC) { 289 if (p.alloc_mode == LFS && gc_type == FG_GC) {
277 p.min_segno = check_bg_victims(sbi); 290 p.min_segno = check_bg_victims(sbi);
278 if (p.min_segno != NULL_SEGNO) 291 if (p.min_segno != NULL_SEGNO)
@@ -295,27 +308,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
295 } 308 }
296 309
297 p.offset = segno + p.ofs_unit; 310 p.offset = segno + p.ofs_unit;
298 if (p.ofs_unit > 1) 311 if (p.ofs_unit > 1) {
299 p.offset -= segno % p.ofs_unit; 312 p.offset -= segno % p.ofs_unit;
313 nsearched += count_bits(p.dirty_segmap,
314 p.offset - p.ofs_unit,
315 p.ofs_unit);
316 } else {
317 nsearched++;
318 }
319
300 320
301 secno = GET_SECNO(sbi, segno); 321 secno = GET_SECNO(sbi, segno);
302 322
303 if (sec_usage_check(sbi, secno)) 323 if (sec_usage_check(sbi, secno))
304 continue; 324 goto next;
305 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) 325 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
306 continue; 326 goto next;
307 327
308 cost = get_gc_cost(sbi, segno, &p); 328 cost = get_gc_cost(sbi, segno, &p);
309 329
310 if (p.min_cost > cost) { 330 if (p.min_cost > cost) {
311 p.min_segno = segno; 331 p.min_segno = segno;
312 p.min_cost = cost; 332 p.min_cost = cost;
313 } else if (unlikely(cost == max_cost)) {
314 continue;
315 } 333 }
316 334next:
317 if (nsearched++ >= p.max_search) { 335 if (nsearched >= p.max_search) {
318 sbi->last_victim[p.gc_mode] = segno; 336 if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
337 sbi->last_victim[p.gc_mode] = last_victim + 1;
338 else
339 sbi->last_victim[p.gc_mode] = segno + 1;
319 break; 340 break;
320 } 341 }
321 } 342 }
@@ -399,7 +420,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
399 * On validity, copy that node with cold status, otherwise (invalid node) 420 * On validity, copy that node with cold status, otherwise (invalid node)
400 * ignore that. 421 * ignore that.
401 */ 422 */
402static int gc_node_segment(struct f2fs_sb_info *sbi, 423static void gc_node_segment(struct f2fs_sb_info *sbi,
403 struct f2fs_summary *sum, unsigned int segno, int gc_type) 424 struct f2fs_summary *sum, unsigned int segno, int gc_type)
404{ 425{
405 bool initial = true; 426 bool initial = true;
@@ -419,7 +440,7 @@ next_step:
419 440
420 /* stop BG_GC if there is not enough free sections. */ 441 /* stop BG_GC if there is not enough free sections. */
421 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) 442 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
422 return 0; 443 return;
423 444
424 if (check_valid_map(sbi, segno, off) == 0) 445 if (check_valid_map(sbi, segno, off) == 0)
425 continue; 446 continue;
@@ -446,7 +467,7 @@ next_step:
446 467
447 /* set page dirty and write it */ 468 /* set page dirty and write it */
448 if (gc_type == FG_GC) { 469 if (gc_type == FG_GC) {
449 f2fs_wait_on_page_writeback(node_page, NODE); 470 f2fs_wait_on_page_writeback(node_page, NODE, true);
450 set_page_dirty(node_page); 471 set_page_dirty(node_page);
451 } else { 472 } else {
452 if (!PageWriteback(node_page)) 473 if (!PageWriteback(node_page))
@@ -460,20 +481,6 @@ next_step:
460 initial = false; 481 initial = false;
461 goto next_step; 482 goto next_step;
462 } 483 }
463
464 if (gc_type == FG_GC) {
465 struct writeback_control wbc = {
466 .sync_mode = WB_SYNC_ALL,
467 .nr_to_write = LONG_MAX,
468 .for_reclaim = 0,
469 };
470 sync_node_pages(sbi, 0, &wbc);
471
472 /* return 1 only if FG_GC successfully reclaimed one */
473 if (get_valid_blocks(sbi, segno, 1) == 0)
474 return 1;
475 }
476 return 0;
477} 484}
478 485
479/* 486/*
@@ -483,7 +490,7 @@ next_step:
483 * as indirect or double indirect node blocks, are given, it must be a caller's 490 * as indirect or double indirect node blocks, are given, it must be a caller's
484 * bug. 491 * bug.
485 */ 492 */
486block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) 493block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
487{ 494{
488 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; 495 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
489 unsigned int bidx; 496 unsigned int bidx;
@@ -500,7 +507,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
500 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); 507 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
501 bidx = node_ofs - 5 - dec; 508 bidx = node_ofs - 5 - dec;
502 } 509 }
503 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); 510 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
504} 511}
505 512
506static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 513static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
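start_bidx_of_node() now takes the inode itself because ADDRS_PER_INODE() needs per-inode state (the in-inode pointer count shrinks when inline xattrs claim part of the block). The mapping is plain arithmetic; a standalone check for the two simple direct-node cases, using this era's 4 KiB-block constants (923 in-inode pointers, 1018 per node block; treat the numbers as illustrative):

#include <stdio.h>

#define ADDRS_PER_BLOCK	1018	/* data pointers per direct node block */
#define ADDRS_PER_INODE	 923	/* data pointers kept in the inode block */

/* Simplified start_bidx_of_node(): file block where the node at
 * node_ofs begins. Indirect offsets (the dec/NIDS_PER_BLOCK math in
 * the hunk above) are omitted from this sketch. */
static unsigned int start_bidx(unsigned int node_ofs)
{
	if (node_ofs == 0)
		return 0;	/* the inode block itself */
	return (node_ofs - 1) * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
}

int main(void)
{
	printf("direct node 1 starts at file block %u\n", start_bidx(1)); /* 923 */
	printf("direct node 2 starts at file block %u\n", start_bidx(2)); /* 1941 */
	return 0;
}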
@@ -546,6 +553,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
546 struct f2fs_summary sum; 553 struct f2fs_summary sum;
547 struct node_info ni; 554 struct node_info ni;
548 struct page *page; 555 struct page *page;
556 block_t newaddr;
549 int err; 557 int err;
550 558
551 /* do not read out */ 559 /* do not read out */
@@ -567,21 +575,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
567 * don't cache encrypted data into meta inode until previous dirty 575 * don't cache encrypted data into meta inode until previous dirty
568 * data were writebacked to avoid racing between GC and flush. 576 * data were writebacked to avoid racing between GC and flush.
569 */ 577 */
570 f2fs_wait_on_page_writeback(page, DATA); 578 f2fs_wait_on_page_writeback(page, DATA, true);
571 579
572 get_node_info(fio.sbi, dn.nid, &ni); 580 get_node_info(fio.sbi, dn.nid, &ni);
573 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); 581 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
574 582
575 /* read page */ 583 /* read page */
576 fio.page = page; 584 fio.page = page;
577 fio.blk_addr = dn.data_blkaddr; 585 fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
578 586
579 fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), 587 allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
580 fio.blk_addr, 588 &sum, CURSEG_COLD_DATA);
581 FGP_LOCK|FGP_CREAT, 589
582 GFP_NOFS); 590 fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
583 if (!fio.encrypted_page) 591 FGP_LOCK | FGP_CREAT, GFP_NOFS);
584 goto put_out; 592 if (!fio.encrypted_page) {
593 err = -ENOMEM;
594 goto recover_block;
595 }
585 596
586 err = f2fs_submit_page_bio(&fio); 597 err = f2fs_submit_page_bio(&fio);
587 if (err) 598 if (err)
@@ -590,33 +601,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
590 /* write page */ 601 /* write page */
591 lock_page(fio.encrypted_page); 602 lock_page(fio.encrypted_page);
592 603
593 if (unlikely(!PageUptodate(fio.encrypted_page))) 604 if (unlikely(!PageUptodate(fio.encrypted_page))) {
605 err = -EIO;
594 goto put_page_out; 606 goto put_page_out;
595 if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) 607 }
608 if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
609 err = -EIO;
596 goto put_page_out; 610 goto put_page_out;
611 }
597 612
598 set_page_dirty(fio.encrypted_page); 613 set_page_dirty(fio.encrypted_page);
599 f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); 614 f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
600 if (clear_page_dirty_for_io(fio.encrypted_page)) 615 if (clear_page_dirty_for_io(fio.encrypted_page))
601 dec_page_count(fio.sbi, F2FS_DIRTY_META); 616 dec_page_count(fio.sbi, F2FS_DIRTY_META);
602 617
603 set_page_writeback(fio.encrypted_page); 618 set_page_writeback(fio.encrypted_page);
604 619
605 /* allocate block address */ 620 /* allocate block address */
606 f2fs_wait_on_page_writeback(dn.node_page, NODE); 621 f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
607 allocate_data_block(fio.sbi, NULL, fio.blk_addr, 622
608 &fio.blk_addr, &sum, CURSEG_COLD_DATA);
609 fio.rw = WRITE_SYNC; 623 fio.rw = WRITE_SYNC;
624 fio.new_blkaddr = newaddr;
610 f2fs_submit_page_mbio(&fio); 625 f2fs_submit_page_mbio(&fio);
611 626
612 dn.data_blkaddr = fio.blk_addr; 627 f2fs_update_data_blkaddr(&dn, newaddr);
613 set_data_blkaddr(&dn);
614 f2fs_update_extent_cache(&dn);
615 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); 628 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
616 if (page->index == 0) 629 if (page->index == 0)
617 set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); 630 set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
618put_page_out: 631put_page_out:
619 f2fs_put_page(fio.encrypted_page, 1); 632 f2fs_put_page(fio.encrypted_page, 1);
633recover_block:
634 if (err)
635 __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
636 true, true);
620put_out: 637put_out:
621 f2fs_put_dnode(&dn); 638 f2fs_put_dnode(&dn);
622out: 639out:
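move_encrypted_block() now allocates the destination block before any I/O and, if a later step fails, hands it back through __f2fs_replace_block(). The control shape, reduced to a standalone reserve/try/roll-back sketch (all names hypothetical):

#include <errno.h>
#include <stdio.h>

static int reserve_block(int *newaddr) { *newaddr = 1234; return 0; }
static int copy_block(int newaddr)     { (void)newaddr; return -EIO; /* simulate failure */ }
static void release_block(int newaddr) { printf("recovered block %d\n", newaddr); }

static int move_block(void)
{
	int newaddr, err;

	err = reserve_block(&newaddr);	/* allocate_data_block() analogue */
	if (err)
		return err;

	err = copy_block(newaddr);	/* read, re-encrypt, write analogue */
	if (err)
		release_block(newaddr);	/* __f2fs_replace_block(..., true, true) analogue */
	return err;
}

int main(void)
{
	printf("move_block() = %d (-EIO is %d)\n", move_block(), -EIO);
	return 0;
}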
@@ -645,7 +662,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
645 .encrypted_page = NULL, 662 .encrypted_page = NULL,
646 }; 663 };
647 set_page_dirty(page); 664 set_page_dirty(page);
648 f2fs_wait_on_page_writeback(page, DATA); 665 f2fs_wait_on_page_writeback(page, DATA, true);
649 if (clear_page_dirty_for_io(page)) 666 if (clear_page_dirty_for_io(page))
650 inode_dec_dirty_pages(inode); 667 inode_dec_dirty_pages(inode);
651 set_cold_data(page); 668 set_cold_data(page);
@@ -663,7 +680,7 @@ out:
663 * If the parent node is not valid or the data block address is different, 680 * If the parent node is not valid or the data block address is different,
664 * the victim data block is ignored. 681 * the victim data block is ignored.
665 */ 682 */
666static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, 683static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
667 struct gc_inode_list *gc_list, unsigned int segno, int gc_type) 684 struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
668{ 685{
669 struct super_block *sb = sbi->sb; 686 struct super_block *sb = sbi->sb;
@@ -686,7 +703,7 @@ next_step:
686 703
687 /* stop BG_GC if there is not enough free sections. */ 704 /* stop BG_GC if there is not enough free sections. */
688 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) 705 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
689 return 0; 706 return;
690 707
691 if (check_valid_map(sbi, segno, off) == 0) 708 if (check_valid_map(sbi, segno, off) == 0)
692 continue; 709 continue;
@@ -719,7 +736,7 @@ next_step:
719 continue; 736 continue;
720 } 737 }
721 738
722 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); 739 start_bidx = start_bidx_of_node(nofs, inode);
723 data_page = get_read_data_page(inode, 740 data_page = get_read_data_page(inode,
724 start_bidx + ofs_in_node, READA, true); 741 start_bidx + ofs_in_node, READA, true);
725 if (IS_ERR(data_page)) { 742 if (IS_ERR(data_page)) {
@@ -735,7 +752,7 @@ next_step:
735 /* phase 3 */ 752 /* phase 3 */
736 inode = find_gc_inode(gc_list, dni.ino); 753 inode = find_gc_inode(gc_list, dni.ino);
737 if (inode) { 754 if (inode) {
738 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) 755 start_bidx = start_bidx_of_node(nofs, inode)
739 + ofs_in_node; 756 + ofs_in_node;
740 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 757 if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
741 move_encrypted_block(inode, start_bidx); 758 move_encrypted_block(inode, start_bidx);
@@ -747,15 +764,6 @@ next_step:
747 764
748 if (++phase < 4) 765 if (++phase < 4)
749 goto next_step; 766 goto next_step;
750
751 if (gc_type == FG_GC) {
752 f2fs_submit_merged_bio(sbi, DATA, WRITE);
753
754 /* return 1 only if FG_GC successfully reclaimed one */
755 if (get_valid_blocks(sbi, segno, 1) == 0)
756 return 1;
757 }
758 return 0;
759} 767}
760 768
761static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, 769static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -771,53 +779,92 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
771 return ret; 779 return ret;
772} 780}
773 781
774static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, 782static int do_garbage_collect(struct f2fs_sb_info *sbi,
783 unsigned int start_segno,
775 struct gc_inode_list *gc_list, int gc_type) 784 struct gc_inode_list *gc_list, int gc_type)
776{ 785{
777 struct page *sum_page; 786 struct page *sum_page;
778 struct f2fs_summary_block *sum; 787 struct f2fs_summary_block *sum;
779 struct blk_plug plug; 788 struct blk_plug plug;
780 int nfree = 0; 789 unsigned int segno = start_segno;
790 unsigned int end_segno = start_segno + sbi->segs_per_sec;
791 int seg_freed = 0;
792 unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
793 SUM_TYPE_DATA : SUM_TYPE_NODE;
781 794
782 /* read segment summary of victim */ 795 /* readahead multiple SSA blocks that have contiguous addresses */
783 sum_page = get_sum_page(sbi, segno); 796 if (sbi->segs_per_sec > 1)
797 ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
798 sbi->segs_per_sec, META_SSA, true);
799
800 /* reference all summary page */
801 while (segno < end_segno) {
802 sum_page = get_sum_page(sbi, segno++);
803 unlock_page(sum_page);
804 }
784 805
785 blk_start_plug(&plug); 806 blk_start_plug(&plug);
786 807
787 sum = page_address(sum_page); 808 for (segno = start_segno; segno < end_segno; segno++) {
809 /* find segment summary of victim */
810 sum_page = find_get_page(META_MAPPING(sbi),
811 GET_SUM_BLOCK(sbi, segno));
812 f2fs_bug_on(sbi, !PageUptodate(sum_page));
813 f2fs_put_page(sum_page, 0);
788 814
789 /* 815 sum = page_address(sum_page);
790 * this is to avoid deadlock: 816 f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));
791 * - lock_page(sum_page) - f2fs_replace_block 817
792 * - check_valid_map() - mutex_lock(sentry_lock) 818 /*
793 * - mutex_lock(sentry_lock) - change_curseg() 819 * this is to avoid deadlock:
794 * - lock_page(sum_page) 820 * - lock_page(sum_page) - f2fs_replace_block
795 */ 821 * - check_valid_map() - mutex_lock(sentry_lock)
796 unlock_page(sum_page); 822 * - mutex_lock(sentry_lock) - change_curseg()
797 823 * - lock_page(sum_page)
798 switch (GET_SUM_TYPE((&sum->footer))) { 824 */
799 case SUM_TYPE_NODE: 825
800 nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); 826 if (type == SUM_TYPE_NODE)
801 break; 827 gc_node_segment(sbi, sum->entries, segno, gc_type);
802 case SUM_TYPE_DATA: 828 else
803 nfree = gc_data_segment(sbi, sum->entries, gc_list, 829 gc_data_segment(sbi, sum->entries, gc_list, segno,
804 segno, gc_type); 830 gc_type);
805 break; 831
832 stat_inc_seg_count(sbi, type, gc_type);
833
834 f2fs_put_page(sum_page, 0);
835 }
836
837 if (gc_type == FG_GC) {
838 if (type == SUM_TYPE_NODE) {
839 struct writeback_control wbc = {
840 .sync_mode = WB_SYNC_ALL,
841 .nr_to_write = LONG_MAX,
842 .for_reclaim = 0,
843 };
844 sync_node_pages(sbi, 0, &wbc);
845 } else {
846 f2fs_submit_merged_bio(sbi, DATA, WRITE);
847 }
806 } 848 }
849
807 blk_finish_plug(&plug); 850 blk_finish_plug(&plug);
808 851
809 stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); 852 if (gc_type == FG_GC) {
853 while (start_segno < end_segno)
854 if (get_valid_blocks(sbi, start_segno++, 1) == 0)
855 seg_freed++;
856 }
857
810 stat_inc_call_count(sbi->stat_info); 858 stat_inc_call_count(sbi->stat_info);
811 859
812 f2fs_put_page(sum_page, 0); 860 return seg_freed;
813 return nfree;
814} 861}
815 862
816int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) 863int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
817{ 864{
818 unsigned int segno, i; 865 unsigned int segno;
819 int gc_type = sync ? FG_GC : BG_GC; 866 int gc_type = sync ? FG_GC : BG_GC;
820 int sec_freed = 0; 867 int sec_freed = 0, seg_freed;
821 int ret = -EINVAL; 868 int ret = -EINVAL;
822 struct cp_control cpc; 869 struct cp_control cpc;
823 struct gc_inode_list gc_list = { 870 struct gc_inode_list gc_list = {
@@ -838,30 +885,24 @@ gc_more:
838 885
839 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { 886 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
840 gc_type = FG_GC; 887 gc_type = FG_GC;
888 /*
889 * If there is no victim and no prefree segment but still not
890 * enough free sections, we should flush dent/node blocks and do
891 * garbage collection.
892 */
841 if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) 893 if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
842 write_checkpoint(sbi, &cpc); 894 write_checkpoint(sbi, &cpc);
895 else if (has_not_enough_free_secs(sbi, 0))
896 write_checkpoint(sbi, &cpc);
843 } 897 }
844 898
845 if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) 899 if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
846 goto stop; 900 goto stop;
847 ret = 0; 901 ret = 0;
848 902
849 /* readahead multiple SSA blocks that have contiguous addresses */ 903 seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
850 if (sbi->segs_per_sec > 1)
851 ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
852 META_SSA, true);
853
854 for (i = 0; i < sbi->segs_per_sec; i++) {
855 /*
856 * for FG_GC case, halt gcing left segments once failed one
857 * of segments in selected section to avoid long latency.
858 */
859 if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
860 gc_type == FG_GC)
861 break;
862 }
863 904
864 if (i == sbi->segs_per_sec && gc_type == FG_GC) 905 if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
865 sec_freed++; 906 sec_freed++;
866 907
867 if (gc_type == FG_GC) 908 if (gc_type == FG_GC)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index c3f0b7d4cfca..358214e9f707 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -71,7 +71,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from)
71 71
72 addr = inline_data_addr(ipage); 72 addr = inline_data_addr(ipage);
73 73
74 f2fs_wait_on_page_writeback(ipage, NODE); 74 f2fs_wait_on_page_writeback(ipage, NODE, true);
75 memset(addr + from, 0, MAX_INLINE_DATA - from); 75 memset(addr + from, 0, MAX_INLINE_DATA - from);
76 76
77 return true; 77 return true;
@@ -105,7 +105,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
105 105
106int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) 106int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
107{ 107{
108 void *src_addr, *dst_addr;
109 struct f2fs_io_info fio = { 108 struct f2fs_io_info fio = {
110 .sbi = F2FS_I_SB(dn->inode), 109 .sbi = F2FS_I_SB(dn->inode),
111 .type = DATA, 110 .type = DATA,
@@ -115,8 +114,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
115 }; 114 };
116 int dirty, err; 115 int dirty, err;
117 116
118 f2fs_bug_on(F2FS_I_SB(dn->inode), page->index);
119
120 if (!f2fs_exist_data(dn->inode)) 117 if (!f2fs_exist_data(dn->inode))
121 goto clear_out; 118 goto clear_out;
122 119
@@ -124,21 +121,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
124 if (err) 121 if (err)
125 return err; 122 return err;
126 123
127 f2fs_wait_on_page_writeback(page, DATA); 124 f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
128
129 if (PageUptodate(page))
130 goto no_update;
131
132 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
133 125
134 /* Copy the whole inline data block */ 126 read_inline_data(page, dn->inode_page);
135 src_addr = inline_data_addr(dn->inode_page);
136 dst_addr = kmap_atomic(page);
137 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
138 flush_dcache_page(page);
139 kunmap_atomic(dst_addr);
140 SetPageUptodate(page);
141no_update:
142 set_page_dirty(page); 127 set_page_dirty(page);
143 128
144 /* clear dirty state */ 129 /* clear dirty state */
@@ -146,11 +131,9 @@ no_update:
146 131
147 /* write data page to try to make data consistent */ 132 /* write data page to try to make data consistent */
148 set_page_writeback(page); 133 set_page_writeback(page);
149 fio.blk_addr = dn->data_blkaddr; 134 fio.old_blkaddr = dn->data_blkaddr;
150 write_data_page(dn, &fio); 135 write_data_page(dn, &fio);
151 set_data_blkaddr(dn); 136 f2fs_wait_on_page_writeback(page, DATA, true);
152 f2fs_update_extent_cache(dn);
153 f2fs_wait_on_page_writeback(page, DATA);
154 if (dirty) 137 if (dirty)
155 inode_dec_dirty_pages(dn->inode); 138 inode_dec_dirty_pages(dn->inode);
156 139
@@ -159,6 +142,7 @@ no_update:
159 142
160 /* clear inline data and flag after data writeback */ 143 /* clear inline data and flag after data writeback */
161 truncate_inline_inode(dn->inode_page, 0); 144 truncate_inline_inode(dn->inode_page, 0);
145 clear_inline_node(dn->inode_page);
162clear_out: 146clear_out:
163 stat_dec_inline_inode(dn->inode); 147 stat_dec_inline_inode(dn->inode);
164 f2fs_clear_inline_inode(dn->inode); 148 f2fs_clear_inline_inode(dn->inode);
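The conversion path now calls read_inline_data() instead of the open-coded kmap/memcpy it deletes. The amount being copied is fixed by the inode-block layout: the pointer area, minus the inline-xattr slots and one reserved slot, reinterpreted as bytes. A standalone check using this era's constants (illustrative):

#include <stdio.h>

#define DEF_ADDRS_PER_INODE	923	/* __le32 slots in the inode block */
#define INLINE_XATTR_ADDRS	 50	/* slots reserved for inline xattrs */
#define SLOT_SIZE		  4	/* each slot is one __le32 */

int main(void)
{
	unsigned int slots = DEF_ADDRS_PER_INODE - INLINE_XATTR_ADDRS - 1;

	printf("MAX_INLINE_DATA = %u bytes\n", slots * SLOT_SIZE);	/* 3488 */
	return 0;
}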
@@ -223,7 +207,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
223 207
224 f2fs_bug_on(F2FS_I_SB(inode), page->index); 208 f2fs_bug_on(F2FS_I_SB(inode), page->index);
225 209
226 f2fs_wait_on_page_writeback(dn.inode_page, NODE); 210 f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
227 src_addr = kmap_atomic(page); 211 src_addr = kmap_atomic(page);
228 dst_addr = inline_data_addr(dn.inode_page); 212 dst_addr = inline_data_addr(dn.inode_page);
229 memcpy(dst_addr, src_addr, MAX_INLINE_DATA); 213 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
@@ -233,6 +217,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
233 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); 217 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
234 218
235 sync_inode_page(&dn); 219 sync_inode_page(&dn);
220 clear_inline_node(dn.inode_page);
236 f2fs_put_dnode(&dn); 221 f2fs_put_dnode(&dn);
237 return 0; 222 return 0;
238} 223}
@@ -261,7 +246,7 @@ process_inline:
261 ipage = get_node_page(sbi, inode->i_ino); 246 ipage = get_node_page(sbi, inode->i_ino);
262 f2fs_bug_on(sbi, IS_ERR(ipage)); 247 f2fs_bug_on(sbi, IS_ERR(ipage));
263 248
264 f2fs_wait_on_page_writeback(ipage, NODE); 249 f2fs_wait_on_page_writeback(ipage, NODE, true);
265 250
266 src_addr = inline_data_addr(npage); 251 src_addr = inline_data_addr(npage);
267 dst_addr = inline_data_addr(ipage); 252 dst_addr = inline_data_addr(ipage);
@@ -292,7 +277,7 @@ process_inline:
292} 277}
293 278
294struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, 279struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
295 struct f2fs_filename *fname, struct page **res_page) 280 struct fscrypt_name *fname, struct page **res_page)
296{ 281{
297 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 282 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
298 struct f2fs_inline_dentry *inline_dentry; 283 struct f2fs_inline_dentry *inline_dentry;
@@ -389,7 +374,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
389 if (err) 374 if (err)
390 goto out; 375 goto out;
391 376
392 f2fs_wait_on_page_writeback(page, DATA); 377 f2fs_wait_on_page_writeback(page, DATA, true);
393 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); 378 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
394 379
395 dentry_blk = kmap_atomic(page); 380 dentry_blk = kmap_atomic(page);
@@ -469,7 +454,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
469 } 454 }
470 } 455 }
471 456
472 f2fs_wait_on_page_writeback(ipage, NODE); 457 f2fs_wait_on_page_writeback(ipage, NODE, true);
473 458
474 name_hash = f2fs_dentry_hash(name); 459 name_hash = f2fs_dentry_hash(name);
475 make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); 460 make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
@@ -507,7 +492,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
507 int i; 492 int i;
508 493
509 lock_page(page); 494 lock_page(page);
510 f2fs_wait_on_page_writeback(page, NODE); 495 f2fs_wait_on_page_writeback(page, NODE, true);
511 496
512 inline_dentry = inline_data_addr(page); 497 inline_dentry = inline_data_addr(page);
513 bit_pos = dentry - inline_dentry->dentry; 498 bit_pos = dentry - inline_dentry->dentry;
@@ -550,7 +535,7 @@ bool f2fs_empty_inline_dir(struct inode *dir)
550} 535}
551 536
552int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, 537int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
553 struct f2fs_str *fstr) 538 struct fscrypt_str *fstr)
554{ 539{
555 struct inode *inode = file_inode(file); 540 struct inode *inode = file_inode(file);
556 struct f2fs_inline_dentry *inline_dentry = NULL; 541 struct f2fs_inline_dentry *inline_dentry = NULL;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2adeff26be11..cb269c46ac25 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -83,7 +83,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
83 83
84 while (start < end) { 84 while (start < end) {
85 if (*start++) { 85 if (*start++) {
86 f2fs_wait_on_page_writeback(ipage, NODE); 86 f2fs_wait_on_page_writeback(ipage, NODE, true);
87 87
88 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); 88 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
89 set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); 89 set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
@@ -227,7 +227,7 @@ int update_inode(struct inode *inode, struct page *node_page)
227{ 227{
228 struct f2fs_inode *ri; 228 struct f2fs_inode *ri;
229 229
230 f2fs_wait_on_page_writeback(node_page, NODE); 230 f2fs_wait_on_page_writeback(node_page, NODE, true);
231 231
232 ri = F2FS_INODE(node_page); 232 ri = F2FS_INODE(node_page);
233 233
@@ -263,6 +263,10 @@ int update_inode(struct inode *inode, struct page *node_page)
263 set_cold_node(inode, node_page); 263 set_cold_node(inode, node_page);
264 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); 264 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
265 265
266 /* deleted inode */
267 if (inode->i_nlink == 0)
268 clear_inline_node(node_page);
269
266 return set_page_dirty(node_page); 270 return set_page_dirty(node_page);
267} 271}
268 272
@@ -320,7 +324,7 @@ void f2fs_evict_inode(struct inode *inode)
320 324
321 /* some remained atomic pages should discarded */ 325 /* some remained atomic pages should discarded */
322 if (f2fs_is_atomic_file(inode)) 326 if (f2fs_is_atomic_file(inode))
323 commit_inmem_pages(inode, true); 327 drop_inmem_pages(inode);
324 328
325 trace_f2fs_evict_inode(inode); 329 trace_f2fs_evict_inode(inode);
326 truncate_inode_pages_final(&inode->i_data); 330 truncate_inode_pages_final(&inode->i_data);
@@ -385,10 +389,7 @@ no_delete:
385 } 389 }
386 } 390 }
387out_clear: 391out_clear:
388#ifdef CONFIG_F2FS_FS_ENCRYPTION 392 fscrypt_put_encryption_info(inode, NULL);
389 if (fi->i_crypt_info)
390 f2fs_free_encryption_info(inode, fi->i_crypt_info);
391#endif
392 clear_inode(inode); 393 clear_inode(inode);
393} 394}
394 395
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 6f944e5eb76e..7876f1052101 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -169,7 +169,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
169 int err; 169 int err;
170 170
171 if (f2fs_encrypted_inode(dir) && 171 if (f2fs_encrypted_inode(dir) &&
172 !f2fs_is_child_context_consistent_with_parent(dir, inode)) 172 !fscrypt_has_permitted_context(dir, inode))
173 return -EPERM; 173 return -EPERM;
174 174
175 f2fs_balance_fs(sbi, true); 175 f2fs_balance_fs(sbi, true);
@@ -260,6 +260,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
260 struct page *page; 260 struct page *page;
261 nid_t ino; 261 nid_t ino;
262 int err = 0; 262 int err = 0;
263 unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
264
265 if (f2fs_encrypted_inode(dir)) {
266 int res = fscrypt_get_encryption_info(dir);
267
268 /*
269 * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
270 * created while the directory was encrypted and we
271 * have access to the key.
272 */
273 if (fscrypt_has_encryption_key(dir))
274 fscrypt_set_encrypted_dentry(dentry);
275 fscrypt_set_d_op(dentry);
276 if (res && res != -ENOKEY)
277 return ERR_PTR(res);
278 }
263 279
264 if (dentry->d_name.len > F2FS_NAME_LEN) 280 if (dentry->d_name.len > F2FS_NAME_LEN)
265 return ERR_PTR(-ENAMETOOLONG); 281 return ERR_PTR(-ENAMETOOLONG);
@@ -276,15 +292,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
276 if (IS_ERR(inode)) 292 if (IS_ERR(inode))
277 return ERR_CAST(inode); 293 return ERR_CAST(inode);
278 294
295 if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
296 err = __recover_dot_dentries(dir, root_ino);
297 if (err)
298 goto err_out;
299 }
300
279 if (f2fs_has_inline_dots(inode)) { 301 if (f2fs_has_inline_dots(inode)) {
280 err = __recover_dot_dentries(inode, dir->i_ino); 302 err = __recover_dot_dentries(inode, dir->i_ino);
281 if (err) 303 if (err)
282 goto err_out; 304 goto err_out;
283 } 305 }
306 if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) &&
307 (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
308 !fscrypt_has_permitted_context(dir, inode)) {
309 bool nokey = f2fs_encrypted_inode(inode) &&
310 !fscrypt_has_encryption_key(inode);
311 err = nokey ? -ENOKEY : -EPERM;
312 goto err_out;
313 }
284 return d_splice_alias(inode, dentry); 314 return d_splice_alias(inode, dentry);
285 315
286err_out: 316err_out:
287 iget_failed(inode); 317 iput(inode);
288 return ERR_PTR(err); 318 return ERR_PTR(err);
289} 319}
290 320
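A condensed view of the check added above, as a sketch using only the helpers visible in this hunk (the function name lookup_context_check is hypothetical, not a real f2fs symbol):

	static int lookup_context_check(struct inode *dir, struct inode *inode)
	{
		/* only encrypted parents constrain their children */
		if (!f2fs_encrypted_inode(dir))
			return 0;
		/* only directories and symlinks carry a context to compare */
		if (!S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode))
			return 0;
		if (fscrypt_has_permitted_context(dir, inode))
			return 0;
		/* report -ENOKEY when the child is encrypted but keyless,
		 * -EPERM for a genuine policy mismatch */
		return (f2fs_encrypted_inode(inode) &&
			!fscrypt_has_encryption_key(inode)) ? -ENOKEY : -EPERM;
	}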
@@ -345,13 +375,23 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
345 struct f2fs_sb_info *sbi = F2FS_I_SB(dir); 375 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
346 struct inode *inode; 376 struct inode *inode;
347 size_t len = strlen(symname); 377 size_t len = strlen(symname);
348 size_t p_len; 378 struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
349 char *p_str; 379 struct fscrypt_symlink_data *sd = NULL;
350 struct f2fs_str disk_link = FSTR_INIT(NULL, 0);
351 struct f2fs_encrypted_symlink_data *sd = NULL;
352 int err; 380 int err;
353 381
354 if (len > dir->i_sb->s_blocksize) 382 if (f2fs_encrypted_inode(dir)) {
383 err = fscrypt_get_encryption_info(dir);
384 if (err)
385 return err;
386
387 if (!fscrypt_has_encryption_key(dir))
388 return -EPERM;
389
390 disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
391 sizeof(struct fscrypt_symlink_data));
392 }
393
394 if (disk_link.len > dir->i_sb->s_blocksize)
355 return -ENAMETOOLONG; 395 return -ENAMETOOLONG;
356 396
357 inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); 397 inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
@@ -374,42 +414,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
374 f2fs_unlock_op(sbi); 414 f2fs_unlock_op(sbi);
375 alloc_nid_done(sbi, inode->i_ino); 415 alloc_nid_done(sbi, inode->i_ino);
376 416
377 if (f2fs_encrypted_inode(dir)) { 417 if (f2fs_encrypted_inode(inode)) {
378 struct qstr istr = QSTR_INIT(symname, len); 418 struct qstr istr = QSTR_INIT(symname, len);
419 struct fscrypt_str ostr;
379 420
380 err = f2fs_get_encryption_info(inode); 421 sd = kzalloc(disk_link.len, GFP_NOFS);
381 if (err) 422 if (!sd) {
423 err = -ENOMEM;
382 goto err_out; 424 goto err_out;
425 }
383 426
384 err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); 427 err = fscrypt_get_encryption_info(inode);
385 if (err) 428 if (err)
386 goto err_out; 429 goto err_out;
387 430
388 err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); 431 if (!fscrypt_has_encryption_key(inode)) {
389 if (err < 0) 432 err = -EPERM;
390 goto err_out;
391
392 p_len = encrypted_symlink_data_len(disk_link.len) + 1;
393
394 if (p_len > dir->i_sb->s_blocksize) {
395 err = -ENAMETOOLONG;
396 goto err_out; 433 goto err_out;
397 } 434 }
398 435
399 sd = kzalloc(p_len, GFP_NOFS); 436 ostr.name = sd->encrypted_path;
400 if (!sd) { 437 ostr.len = disk_link.len;
401 err = -ENOMEM; 438 err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
439 if (err < 0)
402 goto err_out; 440 goto err_out;
403 } 441
404 memcpy(sd->encrypted_path, disk_link.name, disk_link.len); 442 sd->len = cpu_to_le16(ostr.len);
405 sd->len = cpu_to_le16(disk_link.len); 443 disk_link.name = (char *)sd;
406 p_str = (char *)sd;
407 } else {
408 p_len = len + 1;
409 p_str = (char *)symname;
410 } 444 }
411 445
412 err = page_symlink(inode, p_str, p_len); 446 err = page_symlink(inode, disk_link.name, disk_link.len);
413 447
414err_out: 448err_out:
415 d_instantiate(dentry, inode); 449 d_instantiate(dentry, inode);
@@ -425,7 +459,8 @@ err_out:
425 * performance regression. 459 * performance regression.
426 */ 460 */
427 if (!err) { 461 if (!err) {
428 filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); 462 filemap_write_and_wait_range(inode->i_mapping, 0,
463 disk_link.len - 1);
429 464
430 if (IS_DIRSYNC(dir)) 465 if (IS_DIRSYNC(dir))
431 f2fs_sync_fs(sbi->sb, 1); 466 f2fs_sync_fs(sbi->sb, 1);
@@ -434,7 +469,6 @@ err_out:
434 } 469 }
435 470
436 kfree(sd); 471 kfree(sd);
437 f2fs_fname_crypto_free_buffer(&disk_link);
438 return err; 472 return err;
439out: 473out:
440 handle_failed_inode(inode); 474 handle_failed_inode(inode);
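For reference, the encrypted symlink payload assembled above is a length-prefixed blob; its shape, as declared in the fscrypt headers of this era (reproduced here for context, not part of this diff), is roughly:

	struct fscrypt_symlink_data {
		__le16 len;             /* bytes in encrypted_path */
		char encrypted_path[1]; /* ciphertext, variable length */
	} __packed;

This is why disk_link.len is computed as the encrypted name size plus sizeof(struct fscrypt_symlink_data), and why the whole payload must still fit in a single block.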
@@ -582,7 +616,7 @@ out:
582static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 616static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
583{ 617{
584 if (f2fs_encrypted_inode(dir)) { 618 if (f2fs_encrypted_inode(dir)) {
585 int err = f2fs_get_encryption_info(dir); 619 int err = fscrypt_get_encryption_info(dir);
586 if (err) 620 if (err)
587 return err; 621 return err;
588 } 622 }
@@ -608,11 +642,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
608 struct f2fs_dir_entry *old_dir_entry = NULL; 642 struct f2fs_dir_entry *old_dir_entry = NULL;
609 struct f2fs_dir_entry *old_entry; 643 struct f2fs_dir_entry *old_entry;
610 struct f2fs_dir_entry *new_entry; 644 struct f2fs_dir_entry *new_entry;
645 bool is_old_inline = f2fs_has_inline_dentry(old_dir);
611 int err = -ENOENT; 646 int err = -ENOENT;
612 647
613 if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && 648 if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
614 !f2fs_is_child_context_consistent_with_parent(new_dir, 649 !fscrypt_has_permitted_context(new_dir, old_inode)) {
615 old_inode)) {
616 err = -EPERM; 650 err = -EPERM;
617 goto out; 651 goto out;
618 } 652 }
@@ -654,8 +688,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
654 if (err) 688 if (err)
655 goto put_out_dir; 689 goto put_out_dir;
656 690
657 if (update_dent_inode(old_inode, new_inode, 691 err = update_dent_inode(old_inode, new_inode,
658 &new_dentry->d_name)) { 692 &new_dentry->d_name);
693 if (err) {
659 release_orphan_inode(sbi); 694 release_orphan_inode(sbi);
660 goto put_out_dir; 695 goto put_out_dir;
661 } 696 }
@@ -693,6 +728,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
693 inc_nlink(new_dir); 728 inc_nlink(new_dir);
694 update_inode_page(new_dir); 729 update_inode_page(new_dir);
695 } 730 }
731
732 /*
733 * The old entry and the new entry can live in the same inline
734 * dentry block; attaching the new entry may force an inline
735 * dentry conversion, after which old_entry and old_page would
736 * point to stale addresses. Redo the lookup here to avoid
737 * that.
738 */
739 if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) {
740 f2fs_put_page(old_page, 0);
741 old_page = NULL;
742
743 old_entry = f2fs_find_entry(old_dir,
744 &old_dentry->d_name, &old_page);
745 if (!old_entry) {
746 err = -EIO;
747 f2fs_unlock_op(sbi);
748 goto out_whiteout;
749 }
750 }
696 } 751 }
697 752
698 down_write(&F2FS_I(old_inode)->i_sem); 753 down_write(&F2FS_I(old_inode)->i_sem);
@@ -771,11 +826,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
771 int err = -ENOENT; 826 int err = -ENOENT;
772 827
773 if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && 828 if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
774 (old_dir != new_dir) && 829 (old_dir != new_dir) &&
775 (!f2fs_is_child_context_consistent_with_parent(new_dir, 830 (!fscrypt_has_permitted_context(new_dir, old_inode) ||
776 old_inode) || 831 !fscrypt_has_permitted_context(old_dir, new_inode)))
777 !f2fs_is_child_context_consistent_with_parent(old_dir,
778 new_inode)))
779 return -EPERM; 832 return -EPERM;
780 833
781 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); 834 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -937,16 +990,15 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
937 return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); 990 return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
938} 991}
939 992
940#ifdef CONFIG_F2FS_FS_ENCRYPTION
941static const char *f2fs_encrypted_get_link(struct dentry *dentry, 993static const char *f2fs_encrypted_get_link(struct dentry *dentry,
942 struct inode *inode, 994 struct inode *inode,
943 struct delayed_call *done) 995 struct delayed_call *done)
944{ 996{
945 struct page *cpage = NULL; 997 struct page *cpage = NULL;
946 char *caddr, *paddr = NULL; 998 char *caddr, *paddr = NULL;
947 struct f2fs_str cstr = FSTR_INIT(NULL, 0); 999 struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
948 struct f2fs_str pstr = FSTR_INIT(NULL, 0); 1000 struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
949 struct f2fs_encrypted_symlink_data *sd; 1001 struct fscrypt_symlink_data *sd;
950 loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); 1002 loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
951 u32 max_size = inode->i_sb->s_blocksize; 1003 u32 max_size = inode->i_sb->s_blocksize;
952 int res; 1004 int res;
@@ -954,7 +1006,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
954 if (!dentry) 1006 if (!dentry)
955 return ERR_PTR(-ECHILD); 1007 return ERR_PTR(-ECHILD);
956 1008
957 res = f2fs_get_encryption_info(inode); 1009 res = fscrypt_get_encryption_info(inode);
958 if (res) 1010 if (res)
959 return ERR_PTR(res); 1011 return ERR_PTR(res);
960 1012
@@ -965,7 +1017,8 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
965 caddr[size] = 0; 1017 caddr[size] = 0;
966 1018
967 /* Symlink is encrypted */ 1019 /* Symlink is encrypted */
968 sd = (struct f2fs_encrypted_symlink_data *)caddr; 1020 sd = (struct fscrypt_symlink_data *)caddr;
1021 cstr.name = sd->encrypted_path;
969 cstr.len = le16_to_cpu(sd->len); 1022 cstr.len = le16_to_cpu(sd->len);
970 1023
971 /* this is broken symlink case */ 1024 /* this is broken symlink case */
@@ -973,12 +1026,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
973 res = -ENOENT; 1026 res = -ENOENT;
974 goto errout; 1027 goto errout;
975 } 1028 }
976 cstr.name = kmalloc(cstr.len, GFP_NOFS);
977 if (!cstr.name) {
978 res = -ENOMEM;
979 goto errout;
980 }
981 memcpy(cstr.name, sd->encrypted_path, cstr.len);
982 1029
983 /* this is broken symlink case */ 1030 /* this is broken symlink case */
984 if (unlikely(cstr.name[0] == 0)) { 1031 if (unlikely(cstr.name[0] == 0)) {
@@ -986,22 +1033,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
986 goto errout; 1033 goto errout;
987 } 1034 }
988 1035
989 if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > 1036 if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
990 max_size) {
991 /* Symlink data on the disk is corrupted */ 1037 /* Symlink data on the disk is corrupted */
992 res = -EIO; 1038 res = -EIO;
993 goto errout; 1039 goto errout;
994 } 1040 }
995 res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); 1041 res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
996 if (res) 1042 if (res)
997 goto errout; 1043 goto errout;
998 1044
999 res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); 1045 res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
1000 if (res < 0) 1046 if (res < 0)
1001 goto errout; 1047 goto errout;
1002 1048
1003 kfree(cstr.name);
1004
1005 paddr = pstr.name; 1049 paddr = pstr.name;
1006 1050
1007 /* Null-terminate the name */ 1051 /* Null-terminate the name */
@@ -1011,8 +1055,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
1011 set_delayed_call(done, kfree_link, paddr); 1055 set_delayed_call(done, kfree_link, paddr);
1012 return paddr; 1056 return paddr;
1013errout: 1057errout:
1014 kfree(cstr.name); 1058 fscrypt_fname_free_buffer(&pstr);
1015 f2fs_fname_crypto_free_buffer(&pstr);
1016 page_cache_release(cpage); 1059 page_cache_release(cpage);
1017 return ERR_PTR(res); 1060 return ERR_PTR(res);
1018} 1061}
@@ -1029,7 +1072,6 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
1029 .removexattr = generic_removexattr, 1072 .removexattr = generic_removexattr,
1030#endif 1073#endif
1031}; 1074};
1032#endif
1033 1075
1034const struct inode_operations f2fs_dir_inode_operations = { 1076const struct inode_operations f2fs_dir_inode_operations = {
1035 .create = f2fs_create, 1077 .create = f2fs_create,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 342597a5897f..118321bd1a7f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -257,15 +257,20 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
257 return new; 257 return new;
258} 258}
259 259
260static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, 260static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
261 struct f2fs_nat_entry *ne) 261 struct f2fs_nat_entry *ne)
262{ 262{
263 struct f2fs_nm_info *nm_i = NM_I(sbi);
263 struct nat_entry *e; 264 struct nat_entry *e;
264 265
265 e = __lookup_nat_cache(nm_i, nid); 266 e = __lookup_nat_cache(nm_i, nid);
266 if (!e) { 267 if (!e) {
267 e = grab_nat_entry(nm_i, nid); 268 e = grab_nat_entry(nm_i, nid);
268 node_info_from_raw_nat(&e->ni, ne); 269 node_info_from_raw_nat(&e->ni, ne);
270 } else {
271 f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
272 nat_get_blkaddr(e) != ne->block_addr ||
273 nat_get_version(e) != ne->version);
269 } 274 }
270} 275}
271 276
@@ -354,7 +359,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
354{ 359{
355 struct f2fs_nm_info *nm_i = NM_I(sbi); 360 struct f2fs_nm_info *nm_i = NM_I(sbi);
356 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 361 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
357 struct f2fs_summary_block *sum = curseg->sum_blk; 362 struct f2fs_journal *journal = curseg->journal;
358 nid_t start_nid = START_NID(nid); 363 nid_t start_nid = START_NID(nid);
359 struct f2fs_nat_block *nat_blk; 364 struct f2fs_nat_block *nat_blk;
360 struct page *page = NULL; 365 struct page *page = NULL;
@@ -371,23 +376,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
371 ni->ino = nat_get_ino(e); 376 ni->ino = nat_get_ino(e);
372 ni->blk_addr = nat_get_blkaddr(e); 377 ni->blk_addr = nat_get_blkaddr(e);
373 ni->version = nat_get_version(e); 378 ni->version = nat_get_version(e);
374 } 379 up_read(&nm_i->nat_tree_lock);
375 up_read(&nm_i->nat_tree_lock);
376 if (e)
377 return; 380 return;
381 }
378 382
379 memset(&ne, 0, sizeof(struct f2fs_nat_entry)); 383 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
380 384
381 down_write(&nm_i->nat_tree_lock);
382
383 /* Check current segment summary */ 385 /* Check current segment summary */
384 mutex_lock(&curseg->curseg_mutex); 386 down_read(&curseg->journal_rwsem);
385 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); 387 i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
386 if (i >= 0) { 388 if (i >= 0) {
387 ne = nat_in_journal(sum, i); 389 ne = nat_in_journal(journal, i);
388 node_info_from_raw_nat(ni, &ne); 390 node_info_from_raw_nat(ni, &ne);
389 } 391 }
390 mutex_unlock(&curseg->curseg_mutex); 392 up_read(&curseg->journal_rwsem);
391 if (i >= 0) 393 if (i >= 0)
392 goto cache; 394 goto cache;
393 395
@@ -398,19 +400,52 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
398 node_info_from_raw_nat(ni, &ne); 400 node_info_from_raw_nat(ni, &ne);
399 f2fs_put_page(page, 1); 401 f2fs_put_page(page, 1);
400cache: 402cache:
403 up_read(&nm_i->nat_tree_lock);
401 /* cache nat entry */ 404 /* cache nat entry */
402 cache_nat_entry(NM_I(sbi), nid, &ne); 405 down_write(&nm_i->nat_tree_lock);
406 cache_nat_entry(sbi, nid, &ne);
403 up_write(&nm_i->nat_tree_lock); 407 up_write(&nm_i->nat_tree_lock);
404} 408}
405 409
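The locking rework turns get_node_info() into a three-stage lookup that holds nat_tree_lock only for reading, and re-takes it for writing just to fill the cache. A skeletal view, where fill_from_cache() and read_nat_block() are placeholder names for code elided here, not real helpers:

	down_read(&nm_i->nat_tree_lock);
	e = __lookup_nat_cache(nm_i, nid);         /* stage 1: NAT cache */
	if (e) {
		fill_from_cache(ni, e);            /* placeholder */
		up_read(&nm_i->nat_tree_lock);
		return;
	}
	down_read(&curseg->journal_rwsem);         /* stage 2: NAT journal */
	i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
	if (i >= 0)
		ne = nat_in_journal(journal, i);
	up_read(&curseg->journal_rwsem);
	if (i < 0)
		read_nat_block(sbi, nid, &ne);     /* stage 3: on-disk NAT block */
	up_read(&nm_i->nat_tree_lock);

	down_write(&nm_i->nat_tree_lock);          /* publish to the cache */
	cache_nat_entry(sbi, nid, &ne);
	up_write(&nm_i->nat_tree_lock);

Dropping the read lock before taking the write lock leaves a window in which another task can cache the entry first; the new f2fs_bug_on() cross-check in cache_nat_entry() above verifies that such a racing insertion agrees with what we read.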
410pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
411{
412 const long direct_index = ADDRS_PER_INODE(dn->inode);
413 const long direct_blks = ADDRS_PER_BLOCK;
414 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
415 unsigned int skipped_unit = ADDRS_PER_BLOCK;
416 int cur_level = dn->cur_level;
417 int max_level = dn->max_level;
418 pgoff_t base = 0;
419
420 if (!dn->max_level)
421 return pgofs + 1;
422
423 while (max_level-- > cur_level)
424 skipped_unit *= NIDS_PER_BLOCK;
425
426 switch (dn->max_level) {
427 case 3:
428 base += 2 * indirect_blks;
429 case 2:
430 base += 2 * direct_blks;
431 case 1:
432 base += direct_index;
433 break;
434 default:
435 f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
436 }
437
438 return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
439}
440
406/* 441/*
407 * The maximum depth is four. 442 * The maximum depth is four.
408 * Offset[0] will have raw inode offset. 443 * Offset[0] will have raw inode offset.
409 */ 444 */
410static int get_node_path(struct f2fs_inode_info *fi, long block, 445static int get_node_path(struct inode *inode, long block,
411 int offset[4], unsigned int noffset[4]) 446 int offset[4], unsigned int noffset[4])
412{ 447{
413 const long direct_index = ADDRS_PER_INODE(fi); 448 const long direct_index = ADDRS_PER_INODE(inode);
414 const long direct_blks = ADDRS_PER_BLOCK; 449 const long direct_blks = ADDRS_PER_BLOCK;
415 const long dptrs_per_blk = NIDS_PER_BLOCK; 450 const long dptrs_per_blk = NIDS_PER_BLOCK;
416 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; 451 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -495,10 +530,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
495 int offset[4]; 530 int offset[4];
496 unsigned int noffset[4]; 531 unsigned int noffset[4];
497 nid_t nids[4]; 532 nid_t nids[4];
498 int level, i; 533 int level, i = 0;
499 int err = 0; 534 int err = 0;
500 535
501 level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); 536 level = get_node_path(dn->inode, index, offset, noffset);
502 537
503 nids[0] = dn->inode->i_ino; 538 nids[0] = dn->inode->i_ino;
504 npage[0] = dn->inode_page; 539 npage[0] = dn->inode_page;
@@ -585,6 +620,10 @@ release_pages:
585release_out: 620release_out:
586 dn->inode_page = NULL; 621 dn->inode_page = NULL;
587 dn->node_page = NULL; 622 dn->node_page = NULL;
623 if (err == -ENOENT) {
624 dn->cur_level = i;
625 dn->max_level = level;
626 }
588 return err; 627 return err;
589} 628}
590 629
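Because cur_level and max_level are now stashed on -ENOENT, get_next_page_offset() can step over an entire missing subtree instead of probing offset by offset. A self-contained illustration of the arithmetic as written, with typical 4 KB-block constants (923/1018/1018 are assumed values; they vary with inline xattrs):

	#include <stdio.h>

	int main(void)
	{
		const long direct_index = 923;    /* ADDRS_PER_INODE, assumed */
		const long direct_blks = 1018;    /* ADDRS_PER_BLOCK, assumed */
		const long nids_per_blk = 1018;   /* NIDS_PER_BLOCK, assumed */
		int cur_level = 1, max_level = 2; /* as stashed by get_dnode_of_data */
		long skipped_unit = direct_blks;
		long base = 2 * direct_blks + direct_index; /* cases 2 + 1 */
		long pgofs = 500000;              /* offset that hit -ENOENT */

		while (max_level-- > cur_level)
			skipped_unit *= nids_per_blk;   /* 1018 * 1018 = 1036324 */

		/* first offset past the missing subtree: prints 1039283 */
		printf("%ld\n",
		       ((pgofs - base) / skipped_unit + 1) * skipped_unit + base);
		return 0;
	}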
@@ -792,7 +831,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
792 831
793 trace_f2fs_truncate_inode_blocks_enter(inode, from); 832 trace_f2fs_truncate_inode_blocks_enter(inode, from);
794 833
795 level = get_node_path(F2FS_I(inode), from, offset, noffset); 834 level = get_node_path(inode, from, offset, noffset);
796restart: 835restart:
797 page = get_node_page(sbi, inode->i_ino); 836 page = get_node_page(sbi, inode->i_ino);
798 if (IS_ERR(page)) { 837 if (IS_ERR(page)) {
@@ -861,7 +900,7 @@ skip_partial:
861 f2fs_put_page(page, 1); 900 f2fs_put_page(page, 1);
862 goto restart; 901 goto restart;
863 } 902 }
864 f2fs_wait_on_page_writeback(page, NODE); 903 f2fs_wait_on_page_writeback(page, NODE, true);
865 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 904 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
866 set_page_dirty(page); 905 set_page_dirty(page);
867 unlock_page(page); 906 unlock_page(page);
@@ -976,7 +1015,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
976 new_ni.ino = dn->inode->i_ino; 1015 new_ni.ino = dn->inode->i_ino;
977 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 1016 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
978 1017
979 f2fs_wait_on_page_writeback(page, NODE); 1018 f2fs_wait_on_page_writeback(page, NODE, true);
980 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); 1019 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
981 set_cold_node(dn->inode, page); 1020 set_cold_node(dn->inode, page);
982 SetPageUptodate(page); 1021 SetPageUptodate(page);
@@ -1029,7 +1068,7 @@ static int read_node_page(struct page *page, int rw)
1029 if (PageUptodate(page)) 1068 if (PageUptodate(page))
1030 return LOCKED_PAGE; 1069 return LOCKED_PAGE;
1031 1070
1032 fio.blk_addr = ni.blk_addr; 1071 fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1033 return f2fs_submit_page_bio(&fio); 1072 return f2fs_submit_page_bio(&fio);
1034} 1073}
1035 1074
@@ -1045,12 +1084,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1045 return; 1084 return;
1046 f2fs_bug_on(sbi, check_nid_range(sbi, nid)); 1085 f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1047 1086
1048 apage = find_get_page(NODE_MAPPING(sbi), nid); 1087 rcu_read_lock();
1049 if (apage && PageUptodate(apage)) { 1088 apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
1050 f2fs_put_page(apage, 0); 1089 rcu_read_unlock();
1090 if (apage)
1051 return; 1091 return;
1052 }
1053 f2fs_put_page(apage, 0);
1054 1092
1055 apage = grab_cache_page(NODE_MAPPING(sbi), nid); 1093 apage = grab_cache_page(NODE_MAPPING(sbi), nid);
1056 if (!apage) 1094 if (!apage)
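The readahead fast path now peeks at the page cache locklessly instead of taking and dropping a page reference. That is safe only because the result is advisory; the same calls, annotated:

	rcu_read_lock();
	/* lockless peek: no reference is taken, so the page may be freed
	 * or truncated the moment rcu_read_lock() is dropped */
	apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
	rcu_read_unlock();
	if (apage)
		return;  /* likely cached already; skipping one RA is harmless */

	/* miss (or lost race): fall back to the normal referenced lookup */
	apage = grab_cache_page(NODE_MAPPING(sbi), nid);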
@@ -1063,7 +1101,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1063/* 1101/*
1064 * readahead MAX_RA_NODE number of node pages. 1102 * readahead MAX_RA_NODE number of node pages.
1065 */ 1103 */
1066void ra_node_pages(struct page *parent, int start) 1104static void ra_node_pages(struct page *parent, int start)
1067{ 1105{
1068 struct f2fs_sb_info *sbi = F2FS_P_SB(parent); 1106 struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1069 struct blk_plug plug; 1107 struct blk_plug plug;
@@ -1083,7 +1121,7 @@ void ra_node_pages(struct page *parent, int start)
1083 blk_finish_plug(&plug); 1121 blk_finish_plug(&plug);
1084} 1122}
1085 1123
1086struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, 1124static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1087 struct page *parent, int start) 1125 struct page *parent, int start)
1088{ 1126{
1089 struct page *page; 1127 struct page *page;
@@ -1154,19 +1192,57 @@ void sync_inode_page(struct dnode_of_data *dn)
1154 dn->node_changed = ret ? true : false; 1192
1155} 1193}
1156 1194
1195static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1196{
1197 struct inode *inode;
1198 struct page *page;
1199
1200 /* should flush inline_data before evict_inode */
1201 inode = ilookup(sbi->sb, ino);
1202 if (!inode)
1203 return;
1204
1205 page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0);
1206 if (!page)
1207 goto iput_out;
1208
1209 if (!trylock_page(page))
1210 goto release_out;
1211
1212 if (!PageUptodate(page))
1213 goto page_out;
1214
1215 if (!PageDirty(page))
1216 goto page_out;
1217
1218 if (!clear_page_dirty_for_io(page))
1219 goto page_out;
1220
1221 if (!f2fs_write_inline_data(inode, page))
1222 inode_dec_dirty_pages(inode);
1223 else
1224 set_page_dirty(page);
1225page_out:
1226 unlock_page(page);
1227release_out:
1228 f2fs_put_page(page, 0);
1229iput_out:
1230 iput(inode);
1231}
1232
1157int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, 1233int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1158 struct writeback_control *wbc) 1234 struct writeback_control *wbc)
1159{ 1235{
1160 pgoff_t index, end; 1236 pgoff_t index, end;
1161 struct pagevec pvec; 1237 struct pagevec pvec;
1162 int step = ino ? 2 : 0; 1238 int step = ino ? 2 : 0;
1163 int nwritten = 0, wrote = 0; 1239 int nwritten = 0;
1164 1240
1165 pagevec_init(&pvec, 0); 1241 pagevec_init(&pvec, 0);
1166 1242
1167next_step: 1243next_step:
1168 index = 0; 1244 index = 0;
1169 end = LONG_MAX; 1245 end = ULONG_MAX;
1170 1246
1171 while (index <= end) { 1247 while (index <= end) {
1172 int i, nr_pages; 1248 int i, nr_pages;
@@ -1203,6 +1279,7 @@ next_step:
1203 * In fsync mode, 1279
1204 * we should not skip writing node pages. 1280 * we should not skip writing node pages.
1205 */ 1281 */
1282lock_node:
1206 if (ino && ino_of_node(page) == ino) 1283 if (ino && ino_of_node(page) == ino)
1207 lock_page(page); 1284 lock_page(page);
1208 else if (!trylock_page(page)) 1285 else if (!trylock_page(page))
@@ -1221,6 +1298,17 @@ continue_unlock:
1221 goto continue_unlock; 1298 goto continue_unlock;
1222 } 1299 }
1223 1300
1301 /* flush inline_data */
1302 if (!ino && is_inline_node(page)) {
1303 clear_inline_node(page);
1304 unlock_page(page);
1305 flush_inline_data(sbi, ino_of_node(page));
1306 goto lock_node;
1307 }
1308
1309 f2fs_wait_on_page_writeback(page, NODE, true);
1310
1311 BUG_ON(PageWriteback(page));
1224 if (!clear_page_dirty_for_io(page)) 1312 if (!clear_page_dirty_for_io(page))
1225 goto continue_unlock; 1313 goto continue_unlock;
1226 1314
@@ -1238,8 +1326,6 @@ continue_unlock:
1238 1326
1239 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) 1327 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
1240 unlock_page(page); 1328 unlock_page(page);
1241 else
1242 wrote++;
1243 1329
1244 if (--wbc->nr_to_write == 0) 1330 if (--wbc->nr_to_write == 0)
1245 break; 1331 break;
@@ -1257,15 +1343,12 @@ continue_unlock:
1257 step++; 1343 step++;
1258 goto next_step; 1344 goto next_step;
1259 } 1345 }
1260
1261 if (wrote)
1262 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1263 return nwritten; 1346 return nwritten;
1264} 1347}
1265 1348
1266int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1349int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1267{ 1350{
1268 pgoff_t index = 0, end = LONG_MAX; 1351 pgoff_t index = 0, end = ULONG_MAX;
1269 struct pagevec pvec; 1352 struct pagevec pvec;
1270 int ret2 = 0, ret = 0; 1353 int ret2 = 0, ret = 0;
1271 1354
@@ -1287,7 +1370,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1287 continue; 1370 continue;
1288 1371
1289 if (ino && ino_of_node(page) == ino) { 1372 if (ino && ino_of_node(page) == ino) {
1290 f2fs_wait_on_page_writeback(page, NODE); 1373 f2fs_wait_on_page_writeback(page, NODE, true);
1291 if (TestClearPageError(page)) 1374 if (TestClearPageError(page))
1292 ret = -EIO; 1375 ret = -EIO;
1293 } 1376 }
@@ -1326,8 +1409,6 @@ static int f2fs_write_node_page(struct page *page,
1326 if (unlikely(f2fs_cp_error(sbi))) 1409 if (unlikely(f2fs_cp_error(sbi)))
1327 goto redirty_out; 1410 goto redirty_out;
1328 1411
1329 f2fs_wait_on_page_writeback(page, NODE);
1330
1331 /* get old block addr of this node page */ 1412 /* get old block addr of this node page */
1332 nid = nid_of_node(page); 1413 nid = nid_of_node(page);
1333 f2fs_bug_on(sbi, page->index != nid); 1414 f2fs_bug_on(sbi, page->index != nid);
@@ -1351,14 +1432,18 @@ static int f2fs_write_node_page(struct page *page,
1351 } 1432 }
1352 1433
1353 set_page_writeback(page); 1434 set_page_writeback(page);
1354 fio.blk_addr = ni.blk_addr; 1435 fio.old_blkaddr = ni.blk_addr;
1355 write_node_page(nid, &fio); 1436 write_node_page(nid, &fio);
1356 set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); 1437 set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1357 dec_page_count(sbi, F2FS_DIRTY_NODES); 1438 dec_page_count(sbi, F2FS_DIRTY_NODES);
1358 up_read(&sbi->node_write); 1439 up_read(&sbi->node_write);
1440
1441 if (wbc->for_reclaim)
1442 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE);
1443
1359 unlock_page(page); 1444 unlock_page(page);
1360 1445
1361 if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) 1446 if (unlikely(f2fs_cp_error(sbi)))
1362 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1447 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1363 1448
1364 return 0; 1449 return 0;
@@ -1374,8 +1459,6 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1374 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); 1459 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1375 long diff; 1460 long diff;
1376 1461
1377 trace_f2fs_writepages(mapping->host, wbc, NODE);
1378
1379 /* balancing f2fs's metadata in background */ 1462 /* balancing f2fs's metadata in background */
1380 f2fs_balance_fs_bg(sbi); 1463 f2fs_balance_fs_bg(sbi);
1381 1464
@@ -1383,6 +1466,8 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1383 if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) 1466 if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1384 goto skip_write; 1467 goto skip_write;
1385 1468
1469 trace_f2fs_writepages(mapping->host, wbc, NODE);
1470
1386 diff = nr_pages_to_write(sbi, NODE, wbc); 1471 diff = nr_pages_to_write(sbi, NODE, wbc);
1387 wbc->sync_mode = WB_SYNC_NONE; 1472 wbc->sync_mode = WB_SYNC_NONE;
1388 sync_node_pages(sbi, 0, wbc); 1473 sync_node_pages(sbi, 0, wbc);
@@ -1391,6 +1476,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1391 1476
1392skip_write: 1477skip_write:
1393 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); 1478 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1479 trace_f2fs_writepages(mapping->host, wbc, NODE);
1394 return 0; 1480 return 0;
1395} 1481}
1396 1482
@@ -1526,7 +1612,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1526{ 1612{
1527 struct f2fs_nm_info *nm_i = NM_I(sbi); 1613 struct f2fs_nm_info *nm_i = NM_I(sbi);
1528 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1614 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1529 struct f2fs_summary_block *sum = curseg->sum_blk; 1615 struct f2fs_journal *journal = curseg->journal;
1530 int i = 0; 1616 int i = 0;
1531 nid_t nid = nm_i->next_scan_nid; 1617 nid_t nid = nm_i->next_scan_nid;
1532 1618
@@ -1558,16 +1644,18 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1558 nm_i->next_scan_nid = nid; 1644 nm_i->next_scan_nid = nid;
1559 1645
1560 /* find free nids from current sum_pages */ 1646 /* find free nids from current sum_pages */
1561 mutex_lock(&curseg->curseg_mutex); 1647 down_read(&curseg->journal_rwsem);
1562 for (i = 0; i < nats_in_cursum(sum); i++) { 1648 for (i = 0; i < nats_in_cursum(journal); i++) {
1563 block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); 1649 block_t addr;
1564 nid = le32_to_cpu(nid_in_journal(sum, i)); 1650
1651 addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
1652 nid = le32_to_cpu(nid_in_journal(journal, i));
1565 if (addr == NULL_ADDR) 1653 if (addr == NULL_ADDR)
1566 add_free_nid(sbi, nid, true); 1654 add_free_nid(sbi, nid, true);
1567 else 1655 else
1568 remove_free_nid(nm_i, nid); 1656 remove_free_nid(nm_i, nid);
1569 } 1657 }
1570 mutex_unlock(&curseg->curseg_mutex); 1658 up_read(&curseg->journal_rwsem);
1571 up_read(&nm_i->nat_tree_lock); 1659 up_read(&nm_i->nat_tree_lock);
1572 1660
1573 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), 1661 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
@@ -1703,7 +1791,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
1703 src_addr = inline_xattr_addr(page); 1791 src_addr = inline_xattr_addr(page);
1704 inline_size = inline_xattr_size(inode); 1792 inline_size = inline_xattr_size(inode);
1705 1793
1706 f2fs_wait_on_page_writeback(ipage, NODE); 1794 f2fs_wait_on_page_writeback(ipage, NODE, true);
1707 memcpy(dst_addr, src_addr, inline_size); 1795 memcpy(dst_addr, src_addr, inline_size);
1708update_inode: 1796update_inode:
1709 update_inode(inode, ipage); 1797 update_inode(inode, ipage);
@@ -1831,16 +1919,16 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1831{ 1919{
1832 struct f2fs_nm_info *nm_i = NM_I(sbi); 1920 struct f2fs_nm_info *nm_i = NM_I(sbi);
1833 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1921 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1834 struct f2fs_summary_block *sum = curseg->sum_blk; 1922 struct f2fs_journal *journal = curseg->journal;
1835 int i; 1923 int i;
1836 1924
1837 mutex_lock(&curseg->curseg_mutex); 1925 down_write(&curseg->journal_rwsem);
1838 for (i = 0; i < nats_in_cursum(sum); i++) { 1926 for (i = 0; i < nats_in_cursum(journal); i++) {
1839 struct nat_entry *ne; 1927 struct nat_entry *ne;
1840 struct f2fs_nat_entry raw_ne; 1928 struct f2fs_nat_entry raw_ne;
1841 nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); 1929 nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
1842 1930
1843 raw_ne = nat_in_journal(sum, i); 1931 raw_ne = nat_in_journal(journal, i);
1844 1932
1845 ne = __lookup_nat_cache(nm_i, nid); 1933 ne = __lookup_nat_cache(nm_i, nid);
1846 if (!ne) { 1934 if (!ne) {
@@ -1849,8 +1937,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1849 } 1937 }
1850 __set_nat_cache_dirty(nm_i, ne); 1938 __set_nat_cache_dirty(nm_i, ne);
1851 } 1939 }
1852 update_nats_in_cursum(sum, -i); 1940 update_nats_in_cursum(journal, -i);
1853 mutex_unlock(&curseg->curseg_mutex); 1941 up_write(&curseg->journal_rwsem);
1854} 1942}
1855 1943
1856static void __adjust_nat_entry_set(struct nat_entry_set *nes, 1944static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -1875,7 +1963,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1875 struct nat_entry_set *set) 1963 struct nat_entry_set *set)
1876{ 1964{
1877 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1965 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1878 struct f2fs_summary_block *sum = curseg->sum_blk; 1966 struct f2fs_journal *journal = curseg->journal;
1879 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; 1967 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
1880 bool to_journal = true; 1968 bool to_journal = true;
1881 struct f2fs_nat_block *nat_blk; 1969 struct f2fs_nat_block *nat_blk;
@@ -1887,11 +1975,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1887 * #1, flush nat entries to journal in current hot data summary block. 1975 * #1, flush nat entries to journal in current hot data summary block.
1888 * #2, flush nat entries to nat page. 1976 * #2, flush nat entries to nat page.
1889 */ 1977 */
1890 if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) 1978 if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
1891 to_journal = false; 1979 to_journal = false;
1892 1980
1893 if (to_journal) { 1981 if (to_journal) {
1894 mutex_lock(&curseg->curseg_mutex); 1982 down_write(&curseg->journal_rwsem);
1895 } else { 1983 } else {
1896 page = get_next_nat_page(sbi, start_nid); 1984 page = get_next_nat_page(sbi, start_nid);
1897 nat_blk = page_address(page); 1985 nat_blk = page_address(page);
@@ -1908,11 +1996,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1908 continue; 1996 continue;
1909 1997
1910 if (to_journal) { 1998 if (to_journal) {
1911 offset = lookup_journal_in_cursum(sum, 1999 offset = lookup_journal_in_cursum(journal,
1912 NAT_JOURNAL, nid, 1); 2000 NAT_JOURNAL, nid, 1);
1913 f2fs_bug_on(sbi, offset < 0); 2001 f2fs_bug_on(sbi, offset < 0);
1914 raw_ne = &nat_in_journal(sum, offset); 2002 raw_ne = &nat_in_journal(journal, offset);
1915 nid_in_journal(sum, offset) = cpu_to_le32(nid); 2003 nid_in_journal(journal, offset) = cpu_to_le32(nid);
1916 } else { 2004 } else {
1917 raw_ne = &nat_blk->entries[nid - start_nid]; 2005 raw_ne = &nat_blk->entries[nid - start_nid];
1918 } 2006 }
@@ -1924,7 +2012,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1924 } 2012 }
1925 2013
1926 if (to_journal) 2014 if (to_journal)
1927 mutex_unlock(&curseg->curseg_mutex); 2015 up_write(&curseg->journal_rwsem);
1928 else 2016 else
1929 f2fs_put_page(page, 1); 2017 f2fs_put_page(page, 1);
1930 2018
@@ -1941,7 +2029,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1941{ 2029{
1942 struct f2fs_nm_info *nm_i = NM_I(sbi); 2030 struct f2fs_nm_info *nm_i = NM_I(sbi);
1943 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 2031 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1944 struct f2fs_summary_block *sum = curseg->sum_blk; 2032 struct f2fs_journal *journal = curseg->journal;
1945 struct nat_entry_set *setvec[SETVEC_SIZE]; 2033 struct nat_entry_set *setvec[SETVEC_SIZE];
1946 struct nat_entry_set *set, *tmp; 2034 struct nat_entry_set *set, *tmp;
1947 unsigned int found; 2035 unsigned int found;
@@ -1958,7 +2046,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1958 * entries, remove all entries from journal and merge them 2046 * entries, remove all entries from journal and merge them
1959 * into nat entry set. 2047 * into nat entry set.
1960 */ 2048 */
1961 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) 2049 if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1962 remove_nats_in_journal(sbi); 2050 remove_nats_in_journal(sbi);
1963 2051
1964 while ((found = __gang_lookup_nat_set(nm_i, 2052 while ((found = __gang_lookup_nat_set(nm_i,
@@ -1967,7 +2055,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1967 set_idx = setvec[found - 1]->set + 1; 2055 set_idx = setvec[found - 1]->set + 1;
1968 for (idx = 0; idx < found; idx++) 2056 for (idx = 0; idx < found; idx++)
1969 __adjust_nat_entry_set(setvec[idx], &sets, 2057 __adjust_nat_entry_set(setvec[idx], &sets,
1970 MAX_NAT_JENTRIES(sum)); 2058 MAX_NAT_JENTRIES(journal));
1971 } 2059 }
1972 2060
1973 /* flush dirty nats in nat entry set */ 2061 /* flush dirty nats in nat entry set */
@@ -2000,6 +2088,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
2000 nm_i->nat_cnt = 0; 2088 nm_i->nat_cnt = 0;
2001 nm_i->ram_thresh = DEF_RAM_THRESHOLD; 2089 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
2002 nm_i->ra_nid_pages = DEF_RA_NID_PAGES; 2090 nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
2091 nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
2003 2092
2004 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 2093 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
2005 INIT_LIST_HEAD(&nm_i->free_nid_list); 2094 INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d4d1f636fe1c..1f4f9d4569d9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,6 +25,9 @@
25/* control the memory footprint threshold (10MB per 1GB ram) */ 25/* control the memory footprint threshold (10MB per 1GB ram) */
26#define DEF_RAM_THRESHOLD 10 26#define DEF_RAM_THRESHOLD 10
27 27
28/* control dirty nats ratio threshold (default: 10% over max nid count) */
29#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
30
28/* vector size for gang look-up from nat cache that consists of radix tree */ 31/* vector size for gang look-up from nat cache that consists of radix tree */
29#define NATVEC_SIZE 64 32#define NATVEC_SIZE 64
30#define SETVEC_SIZE 32 33#define SETVEC_SIZE 32
@@ -117,6 +120,12 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
117 raw_ne->version = ni->version; 120 raw_ne->version = ni->version;
118} 121}
119 122
123static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
124{
125 return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
126 NM_I(sbi)->dirty_nats_ratio / 100;
127}
128
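To get a feel for the new threshold (max_nid depends on partition size, so the numbers below are made up):

	unsigned int max_nid = 2000000;       /* illustrative only */
	unsigned int dirty_nat_cnt = 250000;  /* illustrative only */
	/* threshold = 2000000 * DEF_DIRTY_NAT_RATIO_THRESHOLD / 100 = 200000,
	 * so 250000 dirty NAT entries count as excess and f2fs_balance_fs_bg()
	 * (see the segment.c hunk below) will trigger a checkpoint */
	int excess = dirty_nat_cnt >= max_nid * DEF_DIRTY_NAT_RATIO_THRESHOLD / 100;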
120enum mem_type { 129enum mem_type {
121 FREE_NIDS, /* indicates the free nid list */ 130 FREE_NIDS, /* indicates the free nid list */
122 NAT_ENTRIES, /* indicates the cached nat entry */ 131 NAT_ENTRIES, /* indicates the cached nat entry */
@@ -321,7 +330,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
321{ 330{
322 struct f2fs_node *rn = F2FS_NODE(p); 331 struct f2fs_node *rn = F2FS_NODE(p);
323 332
324 f2fs_wait_on_page_writeback(p, NODE); 333 f2fs_wait_on_page_writeback(p, NODE, true);
325 334
326 if (i) 335 if (i)
327 rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); 336 rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
@@ -370,6 +379,21 @@ static inline int is_node(struct page *page, int type)
370#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) 379#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
371#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) 380#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
372 381
382static inline int is_inline_node(struct page *page)
383{
384 return PageChecked(page);
385}
386
387static inline void set_inline_node(struct page *page)
388{
389 SetPageChecked(page);
390}
391
392static inline void clear_inline_node(struct page *page)
393{
394 ClearPageChecked(page);
395}
396
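These helpers overload the page's PG_checked bit as an "inode page carries inline data" marker. The consumer is the sync_node_pages() hunk in node.c above; the producer is not part of this diff, so the tagging site below is an assumption:

	/* assumed: tagged wherever inline data is made dirty (not in this diff) */
	set_inline_node(node_page);

	/* consumer, from the sync_node_pages() hunk above: flush the inline
	 * payload before writing the node page itself */
	if (!ino && is_inline_node(page)) {
		clear_inline_node(page);
		unlock_page(page);
		flush_inline_data(sbi, ino_of_node(page));
		goto lock_node;
	}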
373static inline void set_cold_node(struct inode *inode, struct page *page) 397static inline void set_cold_node(struct inode *inode, struct page *page)
374{ 398{
375 struct f2fs_node *rn = F2FS_NODE(page); 399 struct f2fs_node *rn = F2FS_NODE(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 589b20b8677b..0b30cd2aeebd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -350,8 +350,7 @@ got_it:
350 inode = dn->inode; 350 inode = dn->inode;
351 } 351 }
352 352
353 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 353 bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
354 le16_to_cpu(sum.ofs_in_node);
355 354
356 /* 355 /*
357 * if inode page is locked, unlock temporarily, but its reference 356 * if inode page is locked, unlock temporarily, but its reference
@@ -386,10 +385,9 @@ truncate_out:
386static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, 385static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
387 struct page *page, block_t blkaddr) 386 struct page *page, block_t blkaddr)
388{ 387{
389 struct f2fs_inode_info *fi = F2FS_I(inode);
390 unsigned int start, end;
391 struct dnode_of_data dn; 388 struct dnode_of_data dn;
392 struct node_info ni; 389 struct node_info ni;
390 unsigned int start, end;
393 int err = 0, recovered = 0; 391 int err = 0, recovered = 0;
394 392
395 /* step 1: recover xattr */ 393 /* step 1: recover xattr */
@@ -409,8 +407,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
409 goto out; 407 goto out;
410 408
411 /* step 3: recover data indices */ 409 /* step 3: recover data indices */
412 start = start_bidx_of_node(ofs_of_node(page), fi); 410 start = start_bidx_of_node(ofs_of_node(page), inode);
413 end = start + ADDRS_PER_PAGE(page, fi); 411 end = start + ADDRS_PER_PAGE(page, inode);
414 412
415 set_new_dnode(&dn, inode, NULL, NULL, 0); 413 set_new_dnode(&dn, inode, NULL, NULL, 0);
416 414
@@ -418,7 +416,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
418 if (err) 416 if (err)
419 goto out; 417 goto out;
420 418
421 f2fs_wait_on_page_writeback(dn.node_page, NODE); 419 f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
422 420
423 get_node_info(sbi, dn.nid, &ni); 421 get_node_info(sbi, dn.nid, &ni);
424 f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); 422 f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
@@ -467,7 +465,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
467 465
468 /* write dummy data page */ 466 /* write dummy data page */
469 f2fs_replace_block(sbi, &dn, src, dest, 467 f2fs_replace_block(sbi, &dn, src, dest,
470 ni.version, false); 468 ni.version, false, false);
471 recovered++; 469 recovered++;
472 } 470 }
473 } 471 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5904a411c86f..6f16b39f0b52 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,70 +191,145 @@ void register_inmem_page(struct inode *inode, struct page *page)
191 trace_f2fs_register_inmem_page(page, INMEM); 191 trace_f2fs_register_inmem_page(page, INMEM);
192} 192}
193 193
194int commit_inmem_pages(struct inode *inode, bool abort) 194static int __revoke_inmem_pages(struct inode *inode,
195 struct list_head *head, bool drop, bool recover)
196{
197 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
198 struct inmem_pages *cur, *tmp;
199 int err = 0;
200
201 list_for_each_entry_safe(cur, tmp, head, list) {
202 struct page *page = cur->page;
203
204 if (drop)
205 trace_f2fs_commit_inmem_page(page, INMEM_DROP);
206
207 lock_page(page);
208
209 if (recover) {
210 struct dnode_of_data dn;
211 struct node_info ni;
212
213 trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
214
215 set_new_dnode(&dn, inode, NULL, NULL, 0);
216 if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
217 err = -EAGAIN;
218 goto next;
219 }
220 get_node_info(sbi, dn.nid, &ni);
221 f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
222 cur->old_addr, ni.version, true, true);
223 f2fs_put_dnode(&dn);
224 }
225next:
226 ClearPageUptodate(page);
227 set_page_private(page, 0);
228 ClearPagePrivate(page);
229 f2fs_put_page(page, 1);
230
231 list_del(&cur->list);
232 kmem_cache_free(inmem_entry_slab, cur);
233 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
234 }
235 return err;
236}
237
238void drop_inmem_pages(struct inode *inode)
239{
240 struct f2fs_inode_info *fi = F2FS_I(inode);
241
242 mutex_lock(&fi->inmem_lock);
243 __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
244 mutex_unlock(&fi->inmem_lock);
245}
246
247static int __commit_inmem_pages(struct inode *inode,
248 struct list_head *revoke_list)
195{ 249{
196 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 250 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
197 struct f2fs_inode_info *fi = F2FS_I(inode); 251 struct f2fs_inode_info *fi = F2FS_I(inode);
198 struct inmem_pages *cur, *tmp; 252 struct inmem_pages *cur, *tmp;
199 bool submit_bio = false;
200 struct f2fs_io_info fio = { 253 struct f2fs_io_info fio = {
201 .sbi = sbi, 254 .sbi = sbi,
202 .type = DATA, 255 .type = DATA,
203 .rw = WRITE_SYNC | REQ_PRIO, 256 .rw = WRITE_SYNC | REQ_PRIO,
204 .encrypted_page = NULL, 257 .encrypted_page = NULL,
205 }; 258 };
259 bool submit_bio = false;
206 int err = 0; 260 int err = 0;
207 261
208 /*
209 * The abort is true only when f2fs_evict_inode is called.
210 * Basically, the f2fs_evict_inode doesn't produce any data writes, so
211 * that we don't need to call f2fs_balance_fs.
212 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
213 * inode becomes free by iget_locked in f2fs_iget.
214 */
215 if (!abort) {
216 f2fs_balance_fs(sbi, true);
217 f2fs_lock_op(sbi);
218 }
219
220 mutex_lock(&fi->inmem_lock);
221 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { 262 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
222 lock_page(cur->page); 263 struct page *page = cur->page;
223 if (!abort) { 264
224 if (cur->page->mapping == inode->i_mapping) { 265 lock_page(page);
225 set_page_dirty(cur->page); 266 if (page->mapping == inode->i_mapping) {
226 f2fs_wait_on_page_writeback(cur->page, DATA); 267 trace_f2fs_commit_inmem_page(page, INMEM);
227 if (clear_page_dirty_for_io(cur->page)) 268
228 inode_dec_dirty_pages(inode); 269 set_page_dirty(page);
229 trace_f2fs_commit_inmem_page(cur->page, INMEM); 270 f2fs_wait_on_page_writeback(page, DATA, true);
230 fio.page = cur->page; 271 if (clear_page_dirty_for_io(page))
231 err = do_write_data_page(&fio); 272 inode_dec_dirty_pages(inode);
232 if (err) { 273
233 unlock_page(cur->page); 274 fio.page = page;
234 break; 275 err = do_write_data_page(&fio);
235 } 276 if (err) {
236 clear_cold_data(cur->page); 277 unlock_page(page);
237 submit_bio = true; 278 break;
238 } 279 }
239 } else { 280
240 ClearPageUptodate(cur->page); 281 /* record old blkaddr for revoking */
241 trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); 282 cur->old_addr = fio.old_blkaddr;
283
284 clear_cold_data(page);
285 submit_bio = true;
242 } 286 }
243 set_page_private(cur->page, 0); 287 unlock_page(page);
244 ClearPagePrivate(cur->page); 288 list_move_tail(&cur->list, revoke_list);
245 f2fs_put_page(cur->page, 1); 289 }
246 290
247 list_del(&cur->list); 291 if (submit_bio)
248 kmem_cache_free(inmem_entry_slab, cur); 292 f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
249 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); 293
294 if (!err)
295 __revoke_inmem_pages(inode, revoke_list, false, false);
296
297 return err;
298}
299
300int commit_inmem_pages(struct inode *inode)
301{
302 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
303 struct f2fs_inode_info *fi = F2FS_I(inode);
304 struct list_head revoke_list;
305 int err;
306
307 INIT_LIST_HEAD(&revoke_list);
308 f2fs_balance_fs(sbi, true);
309 f2fs_lock_op(sbi);
310
311 mutex_lock(&fi->inmem_lock);
312 err = __commit_inmem_pages(inode, &revoke_list);
313 if (err) {
314 int ret;
315 /*
316 * try to revoke all committed pages, but we could still fail
317 * due to lack of memory or some other reason; if that happens,
318 * -EAGAIN is returned, meaning the transaction is no longer
319 * intact and the caller should use its journal to recover, or
320 * rewrite and re-commit the last transaction. For any other
321 * error, the filesystem has already revoked the pages itself.
322 */
323 ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
324 if (ret)
325 err = ret;
326
327 /* drop all uncommitted pages */
328 __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
250 } 329 }
251 mutex_unlock(&fi->inmem_lock); 330 mutex_unlock(&fi->inmem_lock);
252 331
253 if (!abort) { 332 f2fs_unlock_op(sbi);
254 f2fs_unlock_op(sbi);
255 if (submit_bio)
256 f2fs_submit_merged_bio(sbi, DATA, WRITE);
257 }
258 return err; 333 return err;
259} 334}
260 335
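From the caller's perspective, presumably the atomic-commit ioctl path (not shown in this diff), the new contract is:

	err = commit_inmem_pages(inode);
	if (err == -EAGAIN) {
		/* some already-written blocks could not be revoked: the
		 * transaction is torn, and the application must recover via
		 * its own journal or rewrite and re-commit the transaction */
	} else if (err) {
		/* a write failed but every committed page was revoked, so
		 * on-disk state is as if the commit never happened */
	}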
@@ -291,11 +366,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
291 366
292 /* checkpoint is the only way to shrink partial cached entries */ 367 /* checkpoint is the only way to shrink partial cached entries */
293 if (!available_free_memory(sbi, NAT_ENTRIES) || 368 if (!available_free_memory(sbi, NAT_ENTRIES) ||
294 excess_prefree_segs(sbi) ||
295 !available_free_memory(sbi, INO_ENTRIES) || 369 !available_free_memory(sbi, INO_ENTRIES) ||
370 excess_prefree_segs(sbi) ||
371 excess_dirty_nats(sbi) ||
296 (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { 372 (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
297 if (test_opt(sbi, DATA_FLUSH)) 373 if (test_opt(sbi, DATA_FLUSH)) {
374 struct blk_plug plug;
375
376 blk_start_plug(&plug);
298 sync_dirty_inodes(sbi, FILE_INODE); 377 sync_dirty_inodes(sbi, FILE_INODE);
378 blk_finish_plug(&plug);
379 }
299 f2fs_sync_fs(sbi->sb, true); 380 f2fs_sync_fs(sbi->sb, true);
300 stat_inc_bg_cp_count(sbi->stat_info); 381 stat_inc_bg_cp_count(sbi->stat_info);
301 } 382 }
@@ -502,7 +583,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
502 583
503bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) 584bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
504{ 585{
505 int err = -ENOTSUPP; 586 int err = -EOPNOTSUPP;
506 587
507 if (test_opt(sbi, DISCARD)) { 588 if (test_opt(sbi, DISCARD)) {
508 struct seg_entry *se = get_seg_entry(sbi, 589 struct seg_entry *se = get_seg_entry(sbi,
@@ -841,6 +922,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
841 update_meta_page(sbi, (void *)sum_blk, blk_addr); 922 update_meta_page(sbi, (void *)sum_blk, blk_addr);
842} 923}
843 924
925static void write_current_sum_page(struct f2fs_sb_info *sbi,
926 int type, block_t blk_addr)
927{
928 struct curseg_info *curseg = CURSEG_I(sbi, type);
929 struct page *page = grab_meta_page(sbi, blk_addr);
930 struct f2fs_summary_block *src = curseg->sum_blk;
931 struct f2fs_summary_block *dst;
932
933 dst = (struct f2fs_summary_block *)page_address(page);
934
935 mutex_lock(&curseg->curseg_mutex);
936
937 down_read(&curseg->journal_rwsem);
938 memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
939 up_read(&curseg->journal_rwsem);
940
941 memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
942 memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
943
944 mutex_unlock(&curseg->curseg_mutex);
945
946 set_page_dirty(page);
947 f2fs_put_page(page, 1);
948}
949
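write_current_sum_page() also fixes the lock nesting for the journal split. The f2fs.h side of the split is not in this diff; its assumed shape, inferred from the accessors used throughout these hunks:

	struct curseg_info {
		struct mutex curseg_mutex;          /* guards entries and footer */
		struct f2fs_summary_block *sum_blk; /* summary entries */
		struct rw_semaphore journal_rwsem;  /* guards *journal */
		struct f2fs_journal *journal;       /* NAT/SIT journal, split out */
		/* remaining fields unchanged */
	};

The nesting exercised above is curseg_mutex (outer) then journal_rwsem (inner); paths that touch only the journal, such as get_node_info() and build_free_nids(), take journal_rwsem alone.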
844static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) 950static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
845{ 951{
846 struct curseg_info *curseg = CURSEG_I(sbi, type); 952 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -873,9 +979,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
873 979
874 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 980 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
875 segno = find_next_zero_bit(free_i->free_segmap, 981 segno = find_next_zero_bit(free_i->free_segmap,
876 MAIN_SEGS(sbi), *newseg + 1); 982 (hint + 1) * sbi->segs_per_sec, *newseg + 1);
877 if (segno - *newseg < sbi->segs_per_sec - 983 if (segno < (hint + 1) * sbi->segs_per_sec)
878 (*newseg % sbi->segs_per_sec))
879 goto got_it; 984 goto got_it;
880 } 985 }
881find_other_zone: 986find_other_zone:
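The rewritten bound states the "stay inside the current section" intent directly instead of checking the distance after the fact. A tiny self-contained illustration (the geometry is made up, and hint is taken to be the section of *newseg, matching the surrounding code not shown here):

	#include <stdio.h>

	int main(void)
	{
		unsigned int segs_per_sec = 4;              /* illustrative */
		unsigned int newseg = 9;                    /* current segment */
		unsigned int hint = newseg / segs_per_sec;  /* its section: 2 */

		/* new search window: (*newseg + 1) up to the end of the section */
		printf("scan free_segmap in [%u, %u)\n",
		       newseg + 1, (hint + 1) * segs_per_sec);   /* [10, 12) */
		return 0;
	}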
@@ -1280,8 +1385,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
1280{ 1385{
1281 int type = __get_segment_type(fio->page, fio->type); 1386 int type = __get_segment_type(fio->page, fio->type);
1282 1387
1283 allocate_data_block(fio->sbi, fio->page, fio->blk_addr, 1388 allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
1284 &fio->blk_addr, sum, type); 1389 &fio->new_blkaddr, sum, type);
1285 1390
1286 /* writeout dirty page into bdev */ 1391 /* writeout dirty page into bdev */
1287 f2fs_submit_page_mbio(fio); 1392 f2fs_submit_page_mbio(fio);
@@ -1293,7 +1398,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
1293 .sbi = sbi, 1398 .sbi = sbi,
1294 .type = META, 1399 .type = META,
1295 .rw = WRITE_SYNC | REQ_META | REQ_PRIO, 1400 .rw = WRITE_SYNC | REQ_META | REQ_PRIO,
1296 .blk_addr = page->index, 1401 .old_blkaddr = page->index,
1402 .new_blkaddr = page->index,
1297 .page = page, 1403 .page = page,
1298 .encrypted_page = NULL, 1404 .encrypted_page = NULL,
1299 }; 1405 };
@@ -1323,19 +1429,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
1323 get_node_info(sbi, dn->nid, &ni); 1429 get_node_info(sbi, dn->nid, &ni);
1324 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1430 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1325 do_write_page(&sum, fio); 1431 do_write_page(&sum, fio);
1326 dn->data_blkaddr = fio->blk_addr; 1432 f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
1327} 1433}
1328 1434
1329void rewrite_data_page(struct f2fs_io_info *fio) 1435void rewrite_data_page(struct f2fs_io_info *fio)
1330{ 1436{
1437 fio->new_blkaddr = fio->old_blkaddr;
1331 stat_inc_inplace_blocks(fio->sbi); 1438 stat_inc_inplace_blocks(fio->sbi);
1332 f2fs_submit_page_mbio(fio); 1439 f2fs_submit_page_mbio(fio);
1333} 1440}
1334 1441
1335static void __f2fs_replace_block(struct f2fs_sb_info *sbi, 1442void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
1336 struct f2fs_summary *sum,
1337 block_t old_blkaddr, block_t new_blkaddr, 1443 block_t old_blkaddr, block_t new_blkaddr,
1338 bool recover_curseg) 1444 bool recover_curseg, bool recover_newaddr)
1339{ 1445{
1340 struct sit_info *sit_i = SIT_I(sbi); 1446 struct sit_info *sit_i = SIT_I(sbi);
1341 struct curseg_info *curseg; 1447 struct curseg_info *curseg;
@@ -1378,7 +1484,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
1378 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); 1484 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1379 __add_sum_entry(sbi, type, sum); 1485 __add_sum_entry(sbi, type, sum);
1380 1486
1381 if (!recover_curseg) 1487 if (!recover_curseg || recover_newaddr)
1382 update_sit_entry(sbi, new_blkaddr, 1); 1488 update_sit_entry(sbi, new_blkaddr, 1);
1383 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) 1489 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
1384 update_sit_entry(sbi, old_blkaddr, -1); 1490 update_sit_entry(sbi, old_blkaddr, -1);
@@ -1402,66 +1508,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
1402 1508
1403void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, 1509void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
1404 block_t old_addr, block_t new_addr, 1510 block_t old_addr, block_t new_addr,
1405 unsigned char version, bool recover_curseg) 1511 unsigned char version, bool recover_curseg,
1512 bool recover_newaddr)
1406{ 1513{
1407 struct f2fs_summary sum; 1514 struct f2fs_summary sum;
1408 1515
1409 set_summary(&sum, dn->nid, dn->ofs_in_node, version); 1516 set_summary(&sum, dn->nid, dn->ofs_in_node, version);
1410 1517
1411 __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); 1518 __f2fs_replace_block(sbi, &sum, old_addr, new_addr,
1519 recover_curseg, recover_newaddr);
1412 1520
1413 dn->data_blkaddr = new_addr; 1521 f2fs_update_data_blkaddr(dn, new_addr);
1414 set_data_blkaddr(dn);
1415 f2fs_update_extent_cache(dn);
1416}
1417
1418static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1419 struct page *page, enum page_type type)
1420{
1421 enum page_type btype = PAGE_TYPE_OF_BIO(type);
1422 struct f2fs_bio_info *io = &sbi->write_io[btype];
1423 struct bio_vec *bvec;
1424 struct page *target;
1425 int i;
1426
1427 down_read(&io->io_rwsem);
1428 if (!io->bio) {
1429 up_read(&io->io_rwsem);
1430 return false;
1431 }
1432
1433 bio_for_each_segment_all(bvec, io->bio, i) {
1434
1435 if (bvec->bv_page->mapping) {
1436 target = bvec->bv_page;
1437 } else {
1438 struct f2fs_crypto_ctx *ctx;
1439
1440 /* encrypted page */
1441 ctx = (struct f2fs_crypto_ctx *)page_private(
1442 bvec->bv_page);
1443 target = ctx->w.control_page;
1444 }
1445
1446 if (page == target) {
1447 up_read(&io->io_rwsem);
1448 return true;
1449 }
1450 }
1451
1452 up_read(&io->io_rwsem);
1453 return false;
1454} 1522}
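
The widened condition above boils down to one predicate: the replacement target gets a "+1" in the SIT unless this is a curseg-only recovery that skips revalidating the new address. A userspace sketch of just that predicate (the helper name is invented, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the __f2fs_replace_block() accounting decision. */
static bool account_new_block(bool recover_curseg, bool recover_newaddr)
{
	return !recover_curseg || recover_newaddr;
}

int main(void)
{
	for (int c = 0; c <= 1; c++)
		for (int n = 0; n <= 1; n++)
			printf("recover_curseg=%d recover_newaddr=%d -> +1: %d\n",
			       c, n, account_new_block(c, n));
	return 0;
}
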
1455 1523
1456void f2fs_wait_on_page_writeback(struct page *page, 1524void f2fs_wait_on_page_writeback(struct page *page,
1457 enum page_type type) 1525 enum page_type type, bool ordered)
1458{ 1526{
1459 if (PageWriteback(page)) { 1527 if (PageWriteback(page)) {
1460 struct f2fs_sb_info *sbi = F2FS_P_SB(page); 1528 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1461 1529
1462 if (is_merged_page(sbi, page, type)) 1530 f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE);
1463 f2fs_submit_merged_bio(sbi, type, WRITE); 1531 if (ordered)
1464 wait_on_page_writeback(page); 1532 wait_on_page_writeback(page);
1533 else
1534 wait_for_stable_page(page);
1465 } 1535 }
1466} 1536}
1467 1537
@@ -1477,7 +1547,7 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
1477 1547
1478 cpage = find_lock_page(META_MAPPING(sbi), blkaddr); 1548 cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
1479 if (cpage) { 1549 if (cpage) {
1480 f2fs_wait_on_page_writeback(cpage, DATA); 1550 f2fs_wait_on_page_writeback(cpage, DATA, true);
1481 f2fs_put_page(cpage, 1); 1551 f2fs_put_page(cpage, 1);
1482 } 1552 }
1483} 1553}
@@ -1498,12 +1568,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
1498 1568
1499 /* Step 1: restore nat cache */ 1569 /* Step 1: restore nat cache */
1500 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); 1570 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1501 memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); 1571 memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
1502 1572
1503 /* Step 2: restore sit cache */ 1573 /* Step 2: restore sit cache */
1504 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); 1574 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1505 memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, 1575 memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
1506 SUM_JOURNAL_SIZE);
1507 offset = 2 * SUM_JOURNAL_SIZE; 1576 offset = 2 * SUM_JOURNAL_SIZE;
1508 1577
1509 /* Step 3: restore summary entries */ 1578 /* Step 3: restore summary entries */
@@ -1599,7 +1668,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1599 /* set uncompleted segment to curseg */ 1668 /* set uncompleted segment to curseg */
1600 curseg = CURSEG_I(sbi, type); 1669 curseg = CURSEG_I(sbi, type);
1601 mutex_lock(&curseg->curseg_mutex); 1670 mutex_lock(&curseg->curseg_mutex);
1602 memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); 1671
1672 /* update journal info */
1673 down_write(&curseg->journal_rwsem);
1674 memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
1675 up_write(&curseg->journal_rwsem);
1676
1677 memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
1678 memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
1603 curseg->next_segno = segno; 1679 curseg->next_segno = segno;
1604 reset_curseg(sbi, type, 0); 1680 reset_curseg(sbi, type, 0);
1605 curseg->alloc_type = ckpt->alloc_type[type]; 1681 curseg->alloc_type = ckpt->alloc_type[type];
@@ -1654,13 +1730,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
1654 1730
1655 /* Step 1: write nat cache */ 1731 /* Step 1: write nat cache */
1656 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); 1732 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1657 memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); 1733 memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
1658 written_size += SUM_JOURNAL_SIZE; 1734 written_size += SUM_JOURNAL_SIZE;
1659 1735
1660 /* Step 2: write sit cache */ 1736 /* Step 2: write sit cache */
1661 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); 1737 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1662 memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, 1738 memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
1663 SUM_JOURNAL_SIZE);
1664 written_size += SUM_JOURNAL_SIZE; 1739 written_size += SUM_JOURNAL_SIZE;
1665 1740
1666 /* Step 3: write summary entries */ 1741 /* Step 3: write summary entries */
@@ -1706,12 +1781,8 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
1706 else 1781 else
1707 end = type + NR_CURSEG_NODE_TYPE; 1782 end = type + NR_CURSEG_NODE_TYPE;
1708 1783
1709 for (i = type; i < end; i++) { 1784 for (i = type; i < end; i++)
1710 struct curseg_info *sum = CURSEG_I(sbi, i); 1785 write_current_sum_page(sbi, i, blkaddr + (i - type));
1711 mutex_lock(&sum->curseg_mutex);
1712 write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
1713 mutex_unlock(&sum->curseg_mutex);
1714 }
1715} 1786}
1716 1787
1717void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) 1788void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -1727,24 +1798,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1727 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); 1798 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1728} 1799}
1729 1800
1730int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, 1801int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
1731 unsigned int val, int alloc) 1802 unsigned int val, int alloc)
1732{ 1803{
1733 int i; 1804 int i;
1734 1805
1735 if (type == NAT_JOURNAL) { 1806 if (type == NAT_JOURNAL) {
1736 for (i = 0; i < nats_in_cursum(sum); i++) { 1807 for (i = 0; i < nats_in_cursum(journal); i++) {
1737 if (le32_to_cpu(nid_in_journal(sum, i)) == val) 1808 if (le32_to_cpu(nid_in_journal(journal, i)) == val)
1738 return i; 1809 return i;
1739 } 1810 }
1740 if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL)) 1811 if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
1741 return update_nats_in_cursum(sum, 1); 1812 return update_nats_in_cursum(journal, 1);
1742 } else if (type == SIT_JOURNAL) { 1813 } else if (type == SIT_JOURNAL) {
1743 for (i = 0; i < sits_in_cursum(sum); i++) 1814 for (i = 0; i < sits_in_cursum(journal); i++)
1744 if (le32_to_cpu(segno_in_journal(sum, i)) == val) 1815 if (le32_to_cpu(segno_in_journal(journal, i)) == val)
1745 return i; 1816 return i;
1746 if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL)) 1817 if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
1747 return update_sits_in_cursum(sum, 1); 1818 return update_sits_in_cursum(journal, 1);
1748 } 1819 }
1749 return -1; 1820 return -1;
1750} 1821}
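
The lookup helper now takes the dedicated f2fs_journal rather than the whole summary block, but its shape is unchanged: scan for the key, optionally allocate a slot, or signal the caller to fall back to on-disk blocks. A self-contained userspace sketch of that lookup-or-allocate shape (capacity and field names invented, not the on-disk format):

#include <stdio.h>

#define JOURNAL_CAP 8

struct journal {
	unsigned int keys[JOURNAL_CAP];
	int n_entries;
};

static int lookup_or_alloc(struct journal *j, unsigned int key, int alloc)
{
	for (int i = 0; i < j->n_entries; i++)
		if (j->keys[i] == key)
			return i;
	if (alloc && j->n_entries < JOURNAL_CAP) {
		j->keys[j->n_entries] = key;
		return j->n_entries++;
	}
	return -1;	/* no space: caller falls back to SIT/NAT blocks */
}

int main(void)
{
	struct journal j = { .n_entries = 0 };

	printf("miss, no alloc: %d\n", lookup_or_alloc(&j, 42, 0));
	printf("alloc:          %d\n", lookup_or_alloc(&j, 42, 1));
	printf("hit:            %d\n", lookup_or_alloc(&j, 42, 0));
	return 0;
}
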
@@ -1848,20 +1919,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi)
1848static void remove_sits_in_journal(struct f2fs_sb_info *sbi) 1919static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
1849{ 1920{
1850 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1921 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1851 struct f2fs_summary_block *sum = curseg->sum_blk; 1922 struct f2fs_journal *journal = curseg->journal;
1852 int i; 1923 int i;
1853 1924
1854 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { 1925 down_write(&curseg->journal_rwsem);
1926 for (i = 0; i < sits_in_cursum(journal); i++) {
1855 unsigned int segno; 1927 unsigned int segno;
1856 bool dirtied; 1928 bool dirtied;
1857 1929
1858 segno = le32_to_cpu(segno_in_journal(sum, i)); 1930 segno = le32_to_cpu(segno_in_journal(journal, i));
1859 dirtied = __mark_sit_entry_dirty(sbi, segno); 1931 dirtied = __mark_sit_entry_dirty(sbi, segno);
1860 1932
1861 if (!dirtied) 1933 if (!dirtied)
1862 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); 1934 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
1863 } 1935 }
1864 update_sits_in_cursum(sum, -sits_in_cursum(sum)); 1936 update_sits_in_cursum(journal, -i);
1937 up_write(&curseg->journal_rwsem);
1865} 1938}
1866 1939
1867/* 1940/*
@@ -1873,13 +1946,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1873 struct sit_info *sit_i = SIT_I(sbi); 1946 struct sit_info *sit_i = SIT_I(sbi);
1874 unsigned long *bitmap = sit_i->dirty_sentries_bitmap; 1947 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1875 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1948 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1876 struct f2fs_summary_block *sum = curseg->sum_blk; 1949 struct f2fs_journal *journal = curseg->journal;
1877 struct sit_entry_set *ses, *tmp; 1950 struct sit_entry_set *ses, *tmp;
1878 struct list_head *head = &SM_I(sbi)->sit_entry_set; 1951 struct list_head *head = &SM_I(sbi)->sit_entry_set;
1879 bool to_journal = true; 1952 bool to_journal = true;
1880 struct seg_entry *se; 1953 struct seg_entry *se;
1881 1954
1882 mutex_lock(&curseg->curseg_mutex);
1883 mutex_lock(&sit_i->sentry_lock); 1955 mutex_lock(&sit_i->sentry_lock);
1884 1956
1885 if (!sit_i->dirty_sentries) 1957 if (!sit_i->dirty_sentries)
@@ -1896,7 +1968,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1896 * entries, remove all entries from journal and add and account 1968 * entries, remove all entries from journal and add and account
1897 * them in sit entry set. 1969 * them in sit entry set.
1898 */ 1970 */
1899 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) 1971 if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
1900 remove_sits_in_journal(sbi); 1972 remove_sits_in_journal(sbi);
1901 1973
1902 /* 1974 /*
@@ -1913,10 +1985,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1913 unsigned int segno = start_segno; 1985 unsigned int segno = start_segno;
1914 1986
1915 if (to_journal && 1987 if (to_journal &&
1916 !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) 1988 !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
1917 to_journal = false; 1989 to_journal = false;
1918 1990
1919 if (!to_journal) { 1991 if (to_journal) {
1992 down_write(&curseg->journal_rwsem);
1993 } else {
1920 page = get_next_sit_page(sbi, start_segno); 1994 page = get_next_sit_page(sbi, start_segno);
1921 raw_sit = page_address(page); 1995 raw_sit = page_address(page);
1922 } 1996 }
@@ -1934,13 +2008,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1934 } 2008 }
1935 2009
1936 if (to_journal) { 2010 if (to_journal) {
1937 offset = lookup_journal_in_cursum(sum, 2011 offset = lookup_journal_in_cursum(journal,
1938 SIT_JOURNAL, segno, 1); 2012 SIT_JOURNAL, segno, 1);
1939 f2fs_bug_on(sbi, offset < 0); 2013 f2fs_bug_on(sbi, offset < 0);
1940 segno_in_journal(sum, offset) = 2014 segno_in_journal(journal, offset) =
1941 cpu_to_le32(segno); 2015 cpu_to_le32(segno);
1942 seg_info_to_raw_sit(se, 2016 seg_info_to_raw_sit(se,
1943 &sit_in_journal(sum, offset)); 2017 &sit_in_journal(journal, offset));
1944 } else { 2018 } else {
1945 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 2019 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1946 seg_info_to_raw_sit(se, 2020 seg_info_to_raw_sit(se,
@@ -1952,7 +2026,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1952 ses->entry_cnt--; 2026 ses->entry_cnt--;
1953 } 2027 }
1954 2028
1955 if (!to_journal) 2029 if (to_journal)
2030 up_write(&curseg->journal_rwsem);
2031 else
1956 f2fs_put_page(page, 1); 2032 f2fs_put_page(page, 1);
1957 2033
1958 f2fs_bug_on(sbi, ses->entry_cnt); 2034 f2fs_bug_on(sbi, ses->entry_cnt);
@@ -1967,7 +2043,6 @@ out:
1967 add_discard_addrs(sbi, cpc); 2043 add_discard_addrs(sbi, cpc);
1968 } 2044 }
1969 mutex_unlock(&sit_i->sentry_lock); 2045 mutex_unlock(&sit_i->sentry_lock);
1970 mutex_unlock(&curseg->curseg_mutex);
1971 2046
1972 set_prefree_as_free_segments(sbi); 2047 set_prefree_as_free_segments(sbi);
1973} 2048}
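
flush_sit_entries() keeps journaling dirty SIT entries until a set no longer fits, then switches to raw SIT pages for the remainder of the flush. A toy model of that decision (set sizes and journal capacity invented for the demo):

#include <stdbool.h>
#include <stdio.h>

#define JOURNAL_ROOM 6

int main(void)
{
	int set_sizes[] = { 2, 3, 4 };	/* dirty entries per sit_entry_set */
	int used = 0;
	bool to_journal = true;

	for (int i = 0; i < 3; i++) {
		if (to_journal && used + set_sizes[i] > JOURNAL_ROOM)
			to_journal = false;	/* out of room: use SIT pages */
		if (to_journal)
			used += set_sizes[i];
		printf("set %d (%d entries) -> %s\n", i, set_sizes[i],
		       to_journal ? "journal" : "raw SIT page");
	}
	return 0;
}
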
@@ -2099,6 +2174,11 @@ static int build_curseg(struct f2fs_sb_info *sbi)
2099 array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); 2174 array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
2100 if (!array[i].sum_blk) 2175 if (!array[i].sum_blk)
2101 return -ENOMEM; 2176 return -ENOMEM;
2177 init_rwsem(&array[i].journal_rwsem);
2178 array[i].journal = kzalloc(sizeof(struct f2fs_journal),
2179 GFP_KERNEL);
2180 if (!array[i].journal)
2181 return -ENOMEM;
2102 array[i].segno = NULL_SEGNO; 2182 array[i].segno = NULL_SEGNO;
2103 array[i].next_blkoff = 0; 2183 array[i].next_blkoff = 0;
2104 } 2184 }
@@ -2109,11 +2189,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
2109{ 2189{
2110 struct sit_info *sit_i = SIT_I(sbi); 2190 struct sit_info *sit_i = SIT_I(sbi);
2111 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 2191 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
2112 struct f2fs_summary_block *sum = curseg->sum_blk; 2192 struct f2fs_journal *journal = curseg->journal;
2113 int sit_blk_cnt = SIT_BLK_CNT(sbi); 2193 int sit_blk_cnt = SIT_BLK_CNT(sbi);
2114 unsigned int i, start, end; 2194 unsigned int i, start, end;
2115 unsigned int readed, start_blk = 0; 2195 unsigned int readed, start_blk = 0;
2116 int nrpages = MAX_BIO_BLOCKS(sbi); 2196 int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
2117 2197
2118 do { 2198 do {
2119 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); 2199 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
@@ -2127,16 +2207,16 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
2127 struct f2fs_sit_entry sit; 2207 struct f2fs_sit_entry sit;
2128 struct page *page; 2208 struct page *page;
2129 2209
2130 mutex_lock(&curseg->curseg_mutex); 2210 down_read(&curseg->journal_rwsem);
2131 for (i = 0; i < sits_in_cursum(sum); i++) { 2211 for (i = 0; i < sits_in_cursum(journal); i++) {
2132 if (le32_to_cpu(segno_in_journal(sum, i)) 2212 if (le32_to_cpu(segno_in_journal(journal, i))
2133 == start) { 2213 == start) {
2134 sit = sit_in_journal(sum, i); 2214 sit = sit_in_journal(journal, i);
2135 mutex_unlock(&curseg->curseg_mutex); 2215 up_read(&curseg->journal_rwsem);
2136 goto got_it; 2216 goto got_it;
2137 } 2217 }
2138 } 2218 }
2139 mutex_unlock(&curseg->curseg_mutex); 2219 up_read(&curseg->journal_rwsem);
2140 2220
2141 page = get_current_sit_page(sbi, start); 2221 page = get_current_sit_page(sbi, start);
2142 sit_blk = (struct f2fs_sit_block *)page_address(page); 2222 sit_blk = (struct f2fs_sit_block *)page_address(page);
@@ -2371,8 +2451,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
2371 if (!array) 2451 if (!array)
2372 return; 2452 return;
2373 SM_I(sbi)->curseg_array = NULL; 2453 SM_I(sbi)->curseg_array = NULL;
2374 for (i = 0; i < NR_CURSEG_TYPE; i++) 2454 for (i = 0; i < NR_CURSEG_TYPE; i++) {
2375 kfree(array[i].sum_blk); 2455 kfree(array[i].sum_blk);
2456 kfree(array[i].journal);
2457 }
2376 kfree(array); 2458 kfree(array);
2377} 2459}
2378 2460
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ee44d346ea44..975c33df65c7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -183,7 +183,7 @@ struct segment_allocation {
183 * this value is set in the page as private data, indicating that 183 * this value is set in the page as private data, indicating that
184 * the page is atomically written and is in the inmem_pages list. 184 * the page is atomically written and is in the inmem_pages list.
185 */ 185 */
186#define ATOMIC_WRITTEN_PAGE 0x0000ffff 186#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
187 187
188#define IS_ATOMIC_WRITTEN_PAGE(page) \ 188#define IS_ATOMIC_WRITTEN_PAGE(page) \
189 (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) 189 (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
@@ -191,6 +191,7 @@ struct segment_allocation {
191struct inmem_pages { 191struct inmem_pages {
192 struct list_head list; 192 struct list_head list;
193 struct page *page; 193 struct page *page;
194 block_t old_addr; /* for revoking when fail to commit */
194}; 195};
195 196
196struct sit_info { 197struct sit_info {
@@ -257,6 +258,8 @@ struct victim_selection {
257struct curseg_info { 258struct curseg_info {
258 struct mutex curseg_mutex; /* lock for consistency */ 259 struct mutex curseg_mutex; /* lock for consistency */
259 struct f2fs_summary_block *sum_blk; /* cached summary block */ 260 struct f2fs_summary_block *sum_blk; /* cached summary block */
261 struct rw_semaphore journal_rwsem; /* protect journal area */
262 struct f2fs_journal *journal; /* cached journal info */
260 unsigned char alloc_type; /* current allocation type */ 263 unsigned char alloc_type; /* current allocation type */
261 unsigned int segno; /* current segment number */ 264 unsigned int segno; /* current segment number */
262 unsigned short next_blkoff; /* next block offset to write */ 265 unsigned short next_blkoff; /* next block offset to write */
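
With this struct change, the cached journal gets its own rw_semaphore, so journal readers such as build_sit_entries() no longer serialize against writers like remove_sits_in_journal() on curseg_mutex. A userspace sketch of the resulting locking pattern, using pthread_rwlock_t as a stand-in for the kernel rw_semaphore (all names and sizes invented):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t journal_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int journal_segnos[8];
static int journal_entries = 1;

static int journal_find(unsigned int segno)
{
	int found = -1;

	pthread_rwlock_rdlock(&journal_rwsem);	/* many concurrent readers */
	for (int i = 0; i < journal_entries; i++)
		if (journal_segnos[i] == segno)
			found = i;
	pthread_rwlock_unlock(&journal_rwsem);
	return found;
}

static void journal_drain(void)
{
	pthread_rwlock_wrlock(&journal_rwsem);	/* exclusive writer */
	/* move entries to the dirty SIT set, then empty the journal */
	journal_entries = 0;
	pthread_rwlock_unlock(&journal_rwsem);
}

int main(void)
{
	journal_segnos[0] = 7;
	printf("segno 7 at index %d\n", journal_find(7));
	journal_drain();
	printf("after drain: %d\n", journal_find(7));
	return 0;
}
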
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6134832baaaf..15bb81f8dac2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -126,6 +126,19 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
126 return NULL; 126 return NULL;
127} 127}
128 128
129static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
130 struct f2fs_sb_info *sbi, char *buf)
131{
132 struct super_block *sb = sbi->sb;
133
134 if (!sb->s_bdev->bd_part)
135 return snprintf(buf, PAGE_SIZE, "0\n");
136
137 return snprintf(buf, PAGE_SIZE, "%llu\n",
138 (unsigned long long)(sbi->kbytes_written +
139 BD_PART_WRITTEN(sbi)));
140}
141
129static ssize_t f2fs_sbi_show(struct f2fs_attr *a, 142static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
130 struct f2fs_sb_info *sbi, char *buf) 143 struct f2fs_sb_info *sbi, char *buf)
131{ 144{
@@ -204,6 +217,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \
204 f2fs_sbi_show, f2fs_sbi_store, \ 217 f2fs_sbi_show, f2fs_sbi_store, \
205 offsetof(struct struct_name, elname)) 218 offsetof(struct struct_name, elname))
206 219
220#define F2FS_GENERAL_RO_ATTR(name) \
221static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
222
207F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); 223F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
208F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); 224F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
209F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); 225F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
@@ -216,10 +232,12 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
216F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); 232F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
217F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 233F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
218F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); 234F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
235F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
219F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 236F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
220F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 237F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
221F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); 238F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
222F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); 239F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
240F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
223 241
224#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 242#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
225static struct attribute *f2fs_attrs[] = { 243static struct attribute *f2fs_attrs[] = {
@@ -237,8 +255,10 @@ static struct attribute *f2fs_attrs[] = {
237 ATTR_LIST(dir_level), 255 ATTR_LIST(dir_level),
238 ATTR_LIST(ram_thresh), 256 ATTR_LIST(ram_thresh),
239 ATTR_LIST(ra_nid_pages), 257 ATTR_LIST(ra_nid_pages),
258 ATTR_LIST(dirty_nats_ratio),
240 ATTR_LIST(cp_interval), 259 ATTR_LIST(cp_interval),
241 ATTR_LIST(idle_interval), 260 ATTR_LIST(idle_interval),
261 ATTR_LIST(lifetime_write_kbytes),
242 NULL, 262 NULL,
243}; 263};
244 264
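
lifetime_write_kbytes combines the byte count carried over in the checkpoint with the partition's sector counter delta since mount; the BD_PART_WRITTEN() part is essentially that delta converted from 512-byte sectors to kilobytes (the halving below assumes 512-byte sectors). A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long kbytes_written = 1024;  /* from hot node summary */
	unsigned long long sectors_start  = 2000;  /* sampled at mount time */
	unsigned long long sectors_now    = 6000;  /* part_stat_read() now */
	unsigned long long lifetime_kb =
		kbytes_written + (sectors_now - sectors_start) / 2;

	printf("%llu\n", lifetime_kb);		/* 1024 + 2000 = 3024 */
	return 0;
}
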
@@ -450,10 +470,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
450 470
451 /* Will be used by directory only */ 471 /* Will be used by directory only */
452 fi->i_dir_level = F2FS_SB(sb)->dir_level; 472 fi->i_dir_level = F2FS_SB(sb)->dir_level;
453
454#ifdef CONFIG_F2FS_FS_ENCRYPTION
455 fi->i_crypt_info = NULL;
456#endif
457 return &fi->vfs_inode; 473 return &fi->vfs_inode;
458} 474}
459 475
@@ -474,7 +490,7 @@ static int f2fs_drop_inode(struct inode *inode)
474 490
475 /* some remaining atomic pages should be discarded */ 491
476 if (f2fs_is_atomic_file(inode)) 492 if (f2fs_is_atomic_file(inode))
477 commit_inmem_pages(inode, true); 493 drop_inmem_pages(inode);
478 494
479 /* should remain fi->extent_tree for writepage */ 495 /* should remain fi->extent_tree for writepage */
480 f2fs_destroy_extent_node(inode); 496 f2fs_destroy_extent_node(inode);
@@ -487,11 +503,7 @@ static int f2fs_drop_inode(struct inode *inode)
487 503
488 sb_end_intwrite(inode->i_sb); 504 sb_end_intwrite(inode->i_sb);
489 505
490#ifdef CONFIG_F2FS_FS_ENCRYPTION 506 fscrypt_put_encryption_info(inode, NULL);
491 if (F2FS_I(inode)->i_crypt_info)
492 f2fs_free_encryption_info(inode,
493 F2FS_I(inode)->i_crypt_info);
494#endif
495 spin_lock(&inode->i_lock); 507 spin_lock(&inode->i_lock);
496 atomic_dec(&inode->i_count); 508 atomic_dec(&inode->i_count);
497 } 509 }
@@ -562,6 +574,10 @@ static void f2fs_put_super(struct super_block *sb)
562 f2fs_leave_shrinker(sbi); 574 f2fs_leave_shrinker(sbi);
563 mutex_unlock(&sbi->umount_mutex); 575 mutex_unlock(&sbi->umount_mutex);
564 576
577 /* in our cp_error case, there may still be writeback pages to flush */
578 if (get_pages(sbi, F2FS_WRITEBACK))
579 f2fs_flush_merged_bios(sbi);
580
565 iput(sbi->node_inode); 581 iput(sbi->node_inode);
566 iput(sbi->meta_inode); 582 iput(sbi->meta_inode);
567 583
@@ -574,6 +590,8 @@ static void f2fs_put_super(struct super_block *sb)
574 wait_for_completion(&sbi->s_kobj_unregister); 590 wait_for_completion(&sbi->s_kobj_unregister);
575 591
576 sb->s_fs_info = NULL; 592 sb->s_fs_info = NULL;
593 if (sbi->s_chksum_driver)
594 crypto_free_shash(sbi->s_chksum_driver);
577 kfree(sbi->raw_super); 595 kfree(sbi->raw_super);
578 kfree(sbi); 596 kfree(sbi);
579} 597}
@@ -766,8 +784,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
766 bool need_stop_gc = false; 784 bool need_stop_gc = false;
767 bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); 785 bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
768 786
769 sync_filesystem(sb);
770
771 /* 787 /*
772 * Save the old mount options in case we 788 * Save the old mount options in case we
773 * need to restore them. 789 * need to restore them.
@@ -775,6 +791,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
775 org_mount_opt = sbi->mount_opt; 791 org_mount_opt = sbi->mount_opt;
776 active_logs = sbi->active_logs; 792 active_logs = sbi->active_logs;
777 793
794 if (*flags & MS_RDONLY) {
795 set_opt(sbi, FASTBOOT);
796 set_sbi_flag(sbi, SBI_IS_DIRTY);
797 }
798
799 sync_filesystem(sb);
800
778 sbi->mount_opt.opt = 0; 801 sbi->mount_opt.opt = 0;
779 default_options(sbi); 802 default_options(sbi);
780 803
@@ -862,6 +885,41 @@ static struct super_operations f2fs_sops = {
862 .remount_fs = f2fs_remount, 885 .remount_fs = f2fs_remount,
863}; 886};
864 887
888#ifdef CONFIG_F2FS_FS_ENCRYPTION
889static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
890{
891 return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
892 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
893 ctx, len, NULL);
894}
895
896static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
897 void *fs_data)
898{
899 return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
900 F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
901 ctx, len, fs_data, XATTR_CREATE);
902}
903
904static unsigned f2fs_max_namelen(struct inode *inode)
905{
906 return S_ISLNK(inode->i_mode) ?
907 inode->i_sb->s_blocksize : F2FS_NAME_LEN;
908}
909
910static struct fscrypt_operations f2fs_cryptops = {
911 .get_context = f2fs_get_context,
912 .set_context = f2fs_set_context,
913 .is_encrypted = f2fs_encrypted_inode,
914 .empty_dir = f2fs_empty_dir,
915 .max_namelen = f2fs_max_namelen,
916};
917#else
918static struct fscrypt_operations f2fs_cryptops = {
919 .is_encrypted = f2fs_encrypted_inode,
920};
921#endif
922
865static struct inode *f2fs_nfs_get_inode(struct super_block *sb, 923static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
866 u64 ino, u32 generation) 924 u64 ino, u32 generation)
867{ 925{
@@ -1074,7 +1132,7 @@ static int sanity_check_raw_super(struct super_block *sb,
1074 return 0; 1132 return 0;
1075} 1133}
1076 1134
1077static int sanity_check_ckpt(struct f2fs_sb_info *sbi) 1135int sanity_check_ckpt(struct f2fs_sb_info *sbi)
1078{ 1136{
1079 unsigned int total, fsmeta; 1137 unsigned int total, fsmeta;
1080 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 1138 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -1134,14 +1192,15 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
1134 1192
1135/* 1193/*
1136 * Read f2fs raw super block. 1194 * Read f2fs raw super block.
1137 * Because we have two copies of super block, so read the first one at first, 1195 * Because we have two copies of the super block, read both of them
1138 * if the first one is invalid, move to read the second one. 1196 * to get the first valid one. If either of them is broken, we pass
 1197 * the recovery flag back to the caller.
1139 */ 1198 */
1140static int read_raw_super_block(struct super_block *sb, 1199static int read_raw_super_block(struct super_block *sb,
1141 struct f2fs_super_block **raw_super, 1200 struct f2fs_super_block **raw_super,
1142 int *valid_super_block, int *recovery) 1201 int *valid_super_block, int *recovery)
1143{ 1202{
1144 int block = 0; 1203 int block;
1145 struct buffer_head *bh; 1204 struct buffer_head *bh;
1146 struct f2fs_super_block *super, *buf; 1205 struct f2fs_super_block *super, *buf;
1147 int err = 0; 1206 int err = 0;
@@ -1149,50 +1208,48 @@ static int read_raw_super_block(struct super_block *sb,
1149 super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); 1208 super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
1150 if (!super) 1209 if (!super)
1151 return -ENOMEM; 1210 return -ENOMEM;
1152retry: 1211
1153 bh = sb_bread(sb, block); 1212 for (block = 0; block < 2; block++) {
1154 if (!bh) { 1213 bh = sb_bread(sb, block);
1155 *recovery = 1; 1214 if (!bh) {
1156 f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", 1215 f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
1157 block + 1); 1216 block + 1);
1158 err = -EIO; 1217 err = -EIO;
1159 goto next; 1218 continue;
1160 } 1219 }
1161 1220
1162 buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET); 1221 buf = (struct f2fs_super_block *)
1222 (bh->b_data + F2FS_SUPER_OFFSET);
1163 1223
1164 /* sanity checking of raw super */ 1224 /* sanity checking of raw super */
1165 if (sanity_check_raw_super(sb, buf)) { 1225 if (sanity_check_raw_super(sb, buf)) {
1166 brelse(bh); 1226 f2fs_msg(sb, KERN_ERR,
1167 *recovery = 1; 1227 "Can't find valid F2FS filesystem in %dth superblock",
1168 f2fs_msg(sb, KERN_ERR, 1228 block + 1);
1169 "Can't find valid F2FS filesystem in %dth superblock", 1229 err = -EINVAL;
1170 block + 1); 1230 brelse(bh);
1171 err = -EINVAL; 1231 continue;
1172 goto next; 1232 }
1173 }
1174 1233
1175 if (!*raw_super) { 1234 if (!*raw_super) {
1176 memcpy(super, buf, sizeof(*super)); 1235 memcpy(super, buf, sizeof(*super));
1177 *valid_super_block = block; 1236 *valid_super_block = block;
1178 *raw_super = super; 1237 *raw_super = super;
1238 }
1239 brelse(bh);
1179 } 1240 }
1180 brelse(bh);
1181 1241
1182next: 1242 /* failed to read any of the superblocks */
1183 /* check the validity of the second superblock */ 1243 if (err < 0)
1184 if (block == 0) { 1244 *recovery = 1;
1185 block++;
1186 goto retry;
1187 }
1188 1245
1189 /* No valid superblock */ 1246 /* No valid superblock */
1190 if (!*raw_super) { 1247 if (!*raw_super)
1191 kfree(super); 1248 kfree(super);
1192 return err; 1249 else
1193 } 1250 err = 0;
1194 1251
1195 return 0; 1252 return err;
1196} 1253}
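
The rewritten read_raw_super_block() scans both copies in one loop, keeps the first valid one, and flags recovery whenever any copy failed, even if the other was fine. A userspace model of the same control flow (the validity field stands in for sanity_check_raw_super(); error values mimic -EIO/-EINVAL):

#include <stdio.h>
#include <string.h>

struct sb_copy { int readable; int valid; char data[16]; };

static int read_raw_super(struct sb_copy copies[2], char *out, int *recovery)
{
	int err = 0;
	int have = 0;

	for (int block = 0; block < 2; block++) {
		if (!copies[block].readable) {
			err = -5;		/* -EIO */
			continue;
		}
		if (!copies[block].valid) {
			err = -22;		/* -EINVAL */
			continue;
		}
		if (!have) {			/* keep the first valid copy */
			memcpy(out, copies[block].data, sizeof(copies[block].data));
			have = 1;
		}
	}
	if (err < 0)
		*recovery = 1;			/* some copy needs rewriting */
	return have ? 0 : err;
}

int main(void)
{
	struct sb_copy copies[2] = {
		{ .readable = 1, .valid = 0, .data = "bad" },
		{ .readable = 1, .valid = 1, .data = "good" },
	};
	char sb[16];
	int recovery = 0;
	int ret = read_raw_super(copies, sb, &recovery);

	printf("ret=%d recovery=%d sb=%s\n", ret, recovery, sb);
	return 0;
}
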
1197 1254
1198static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) 1255static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
@@ -1242,6 +1299,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1242 bool retry = true, need_fsck = false; 1299 bool retry = true, need_fsck = false;
1243 char *options = NULL; 1300 char *options = NULL;
1244 int recovery, i, valid_super_block; 1301 int recovery, i, valid_super_block;
1302 struct curseg_info *seg_i;
1245 1303
1246try_onemore: 1304try_onemore:
1247 err = -EINVAL; 1305 err = -EINVAL;
@@ -1254,6 +1312,15 @@ try_onemore:
1254 if (!sbi) 1312 if (!sbi)
1255 return -ENOMEM; 1313 return -ENOMEM;
1256 1314
1315 /* Load the checksum driver */
1316 sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
1317 if (IS_ERR(sbi->s_chksum_driver)) {
1318 f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver.");
1319 err = PTR_ERR(sbi->s_chksum_driver);
1320 sbi->s_chksum_driver = NULL;
1321 goto free_sbi;
1322 }
1323
1257 /* set a block size */ 1324 /* set a block size */
1258 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { 1325 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
1259 f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); 1326 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -1285,6 +1352,7 @@ try_onemore:
1285 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1352 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1286 1353
1287 sb->s_op = &f2fs_sops; 1354 sb->s_op = &f2fs_sops;
1355 sb->s_cop = &f2fs_cryptops;
1288 sb->s_xattr = f2fs_xattr_handlers; 1356 sb->s_xattr = f2fs_xattr_handlers;
1289 sb->s_export_op = &f2fs_export_ops; 1357 sb->s_export_op = &f2fs_export_ops;
1290 sb->s_magic = F2FS_SUPER_MAGIC; 1358 sb->s_magic = F2FS_SUPER_MAGIC;
@@ -1333,13 +1401,6 @@ try_onemore:
1333 goto free_meta_inode; 1401 goto free_meta_inode;
1334 } 1402 }
1335 1403
1336 /* sanity checking of checkpoint */
1337 err = -EINVAL;
1338 if (sanity_check_ckpt(sbi)) {
1339 f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
1340 goto free_cp;
1341 }
1342
1343 sbi->total_valid_node_count = 1404 sbi->total_valid_node_count =
1344 le32_to_cpu(sbi->ckpt->valid_node_count); 1405 le32_to_cpu(sbi->ckpt->valid_node_count);
1345 sbi->total_valid_inode_count = 1406 sbi->total_valid_inode_count =
@@ -1372,6 +1433,17 @@ try_onemore:
1372 goto free_nm; 1433 goto free_nm;
1373 } 1434 }
1374 1435
1436 /* For write statistics */
1437 if (sb->s_bdev->bd_part)
1438 sbi->sectors_written_start =
1439 (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
1440
1441 /* Read accumulated write IO statistics if exists */
1442 seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
1443 if (__exist_node_summaries(sbi))
1444 sbi->kbytes_written =
1445 le64_to_cpu(seg_i->sum_blk->journal.info.kbytes_written);
1446
1375 build_gc_manager(sbi); 1447 build_gc_manager(sbi);
1376 1448
1377 /* get an inode for node space */ 1449 /* get an inode for node space */
@@ -1466,8 +1538,10 @@ try_onemore:
1466 1538
1467 /* recover broken superblock */ 1539 /* recover broken superblock */
1468 if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { 1540 if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
1469 f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); 1541 err = f2fs_commit_super(sbi, true);
1470 f2fs_commit_super(sbi, true); 1542 f2fs_msg(sb, KERN_INFO,
1543 "Try to recover %dth superblock, ret: %ld",
1544 sbi->valid_super_block ? 1 : 2, err);
1471 } 1545 }
1472 1546
1473 f2fs_update_time(sbi, CP_TIME); 1547 f2fs_update_time(sbi, CP_TIME);
@@ -1496,7 +1570,6 @@ free_nm:
1496 destroy_node_manager(sbi); 1570 destroy_node_manager(sbi);
1497free_sm: 1571free_sm:
1498 destroy_segment_manager(sbi); 1572 destroy_segment_manager(sbi);
1499free_cp:
1500 kfree(sbi->ckpt); 1573 kfree(sbi->ckpt);
1501free_meta_inode: 1574free_meta_inode:
1502 make_bad_inode(sbi->meta_inode); 1575 make_bad_inode(sbi->meta_inode);
@@ -1506,6 +1579,8 @@ free_options:
1506free_sb_buf: 1579free_sb_buf:
1507 kfree(raw_super); 1580 kfree(raw_super);
1508free_sbi: 1581free_sbi:
1582 if (sbi->s_chksum_driver)
1583 crypto_free_shash(sbi->s_chksum_driver);
1509 kfree(sbi); 1584 kfree(sbi);
1510 1585
1511 /* give only one another chance */ 1586 /* give only one another chance */
@@ -1585,13 +1660,9 @@ static int __init init_f2fs_fs(void)
1585 err = -ENOMEM; 1660 err = -ENOMEM;
1586 goto free_extent_cache; 1661 goto free_extent_cache;
1587 } 1662 }
1588 err = f2fs_init_crypto();
1589 if (err)
1590 goto free_kset;
1591
1592 err = register_shrinker(&f2fs_shrinker_info); 1663 err = register_shrinker(&f2fs_shrinker_info);
1593 if (err) 1664 if (err)
1594 goto free_crypto; 1665 goto free_kset;
1595 1666
1596 err = register_filesystem(&f2fs_fs_type); 1667 err = register_filesystem(&f2fs_fs_type);
1597 if (err) 1668 if (err)
@@ -1606,8 +1677,6 @@ free_filesystem:
1606 unregister_filesystem(&f2fs_fs_type); 1677 unregister_filesystem(&f2fs_fs_type);
1607free_shrinker: 1678free_shrinker:
1608 unregister_shrinker(&f2fs_shrinker_info); 1679 unregister_shrinker(&f2fs_shrinker_info);
1609free_crypto:
1610 f2fs_exit_crypto();
1611free_kset: 1680free_kset:
1612 kset_unregister(f2fs_kset); 1681 kset_unregister(f2fs_kset);
1613free_extent_cache: 1682free_extent_cache:
@@ -1630,7 +1699,6 @@ static void __exit exit_f2fs_fs(void)
1630 f2fs_destroy_root_stats(); 1699 f2fs_destroy_root_stats();
1631 unregister_shrinker(&f2fs_shrinker_info); 1700 unregister_shrinker(&f2fs_shrinker_info);
1632 unregister_filesystem(&f2fs_fs_type); 1701 unregister_filesystem(&f2fs_fs_type);
1633 f2fs_exit_crypto();
1634 destroy_extent_cache(); 1702 destroy_extent_cache();
1635 destroy_checkpoint_caches(); 1703 destroy_checkpoint_caches();
1636 destroy_segment_manager_caches(); 1704 destroy_segment_manager_caches();
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 145fb659ad44..562ce0821559 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -29,7 +29,8 @@ static inline void __print_last_io(void)
29 last_io.major, last_io.minor, 29 last_io.major, last_io.minor,
30 last_io.pid, "----------------", 30 last_io.pid, "----------------",
31 last_io.type, 31 last_io.type,
32 last_io.fio.rw, last_io.fio.blk_addr, 32 last_io.fio.rw,
33 last_io.fio.new_blkaddr,
33 last_io.len); 34 last_io.len);
34 memset(&last_io, 0, sizeof(last_io)); 35 memset(&last_io, 0, sizeof(last_io));
35} 36}
@@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
101 last_io.pid == pid && 102 last_io.pid == pid &&
102 last_io.type == __file_type(inode, pid) && 103 last_io.type == __file_type(inode, pid) &&
103 last_io.fio.rw == fio->rw && 104 last_io.fio.rw == fio->rw &&
104 last_io.fio.blk_addr + last_io.len == fio->blk_addr) { 105 last_io.fio.new_blkaddr + last_io.len ==
106 fio->new_blkaddr) {
105 last_io.len++; 107 last_io.len++;
106 return; 108 return;
107 } 109 }
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 10f1e784fa23..06a72dc0191a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -300,7 +300,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
300 300
301 if (ipage) { 301 if (ipage) {
302 inline_addr = inline_xattr_addr(ipage); 302 inline_addr = inline_xattr_addr(ipage);
303 f2fs_wait_on_page_writeback(ipage, NODE); 303 f2fs_wait_on_page_writeback(ipage, NODE, true);
304 } else { 304 } else {
305 page = get_node_page(sbi, inode->i_ino); 305 page = get_node_page(sbi, inode->i_ino);
306 if (IS_ERR(page)) { 306 if (IS_ERR(page)) {
@@ -308,7 +308,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
308 return PTR_ERR(page); 308 return PTR_ERR(page);
309 } 309 }
310 inline_addr = inline_xattr_addr(page); 310 inline_addr = inline_xattr_addr(page);
311 f2fs_wait_on_page_writeback(page, NODE); 311 f2fs_wait_on_page_writeback(page, NODE, true);
312 } 312 }
313 memcpy(inline_addr, txattr_addr, inline_size); 313 memcpy(inline_addr, txattr_addr, inline_size);
314 f2fs_put_page(page, 1); 314 f2fs_put_page(page, 1);
@@ -329,7 +329,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
329 return PTR_ERR(xpage); 329 return PTR_ERR(xpage);
330 } 330 }
331 f2fs_bug_on(sbi, new_nid); 331 f2fs_bug_on(sbi, new_nid);
332 f2fs_wait_on_page_writeback(xpage, NODE); 332 f2fs_wait_on_page_writeback(xpage, NODE, true);
333 } else { 333 } else {
334 struct dnode_of_data dn; 334 struct dnode_of_data dn;
335 set_new_dnode(&dn, inode, NULL, NULL, new_nid); 335 set_new_dnode(&dn, inode, NULL, NULL, new_nid);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 79dccc8252dd..f990de20cdcd 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
126 126
127#define f2fs_xattr_handlers NULL 127#define f2fs_xattr_handlers NULL
128static inline int f2fs_setxattr(struct inode *inode, int index, 128static inline int f2fs_setxattr(struct inode *inode, int index,
129 const char *name, const void *value, size_t size, int flags) 129 const char *name, const void *value, size_t size,
130 struct page *page, int flags)
130{ 131{
131 return -EOPNOTSUPP; 132 return -EOPNOTSUPP;
132} 133}
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 182f9ffe2b51..3ff1772f612e 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET
93 that most of your FAT filesystems use, and can be overridden 93 that most of your FAT filesystems use, and can be overridden
94 with the "iocharset" mount option for FAT filesystems. 94 with the "iocharset" mount option for FAT filesystems.
95 Note that "utf8" is not recommended for FAT filesystems. 95 Note that "utf8" is not recommended for FAT filesystems.
96 If unsure, you shouldn't set "utf8" here. 96 If unsure, you shouldn't set "utf8" here - select the next option
97 instead if you would like to use UTF-8 encoded file names by default.
97 See <file:Documentation/filesystems/vfat.txt> for more information. 98 See <file:Documentation/filesystems/vfat.txt> for more information.
98 99
99 Enable any character sets you need in File Systems/Native Language 100 Enable any character sets you need in File Systems/Native Language
100 Support. 101 Support.
102
103config FAT_DEFAULT_UTF8
104 bool "Enable FAT UTF-8 option by default"
105 depends on VFAT_FS
106 default n
107 help
108 Set this if you would like to have "utf8" mount option set
109 by default when mounting FAT filesystems.
110
111 Even if you say Y here can always disable UTF-8 for
112 particular mount by adding "utf8=0" to mount options.
113
114 Say Y if you use UTF-8 encoding for file names, N otherwise.
115
116 See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a5599052116c..226281068a46 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1127,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1127 } 1127 }
1128 opts->name_check = 'n'; 1128 opts->name_check = 'n';
1129 opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; 1129 opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
1130 opts->utf8 = opts->unicode_xlate = 0; 1130 opts->unicode_xlate = 0;
1131 opts->numtail = 1; 1131 opts->numtail = 1;
1132 opts->usefree = opts->nocase = 0; 1132 opts->usefree = opts->nocase = 0;
1133 opts->tz_set = 0; 1133 opts->tz_set = 0;
@@ -1135,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1135 opts->errors = FAT_ERRORS_RO; 1135 opts->errors = FAT_ERRORS_RO;
1136 *debug = 0; 1136 *debug = 0;
1137 1137
1138 opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
1139
1138 if (!options) 1140 if (!options)
1139 goto out; 1141 goto out;
1140 1142
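
With the new Kconfig symbol, parse_options() seeds the utf8 flag from IS_ENABLED() before any mount options are parsed, and only for vfat. A toy demonstration; the IS_ENABLED() below is a simplified stand-in for the kernel macro, and the config value is hard-coded to simulate CONFIG_FAT_DEFAULT_UTF8=y:

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_FAT_DEFAULT_UTF8 1	/* pretend .config set it to y */
#define IS_ENABLED(opt) (opt)		/* simplified stand-in */

int main(void)
{
	bool is_vfat = true;
	bool utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;

	printf("default utf8 = %d\n", utf8);	/* mount options may still override */
	return 0;
}
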
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712dfa3e7..ca3c3dd01789 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd,
228 path_put(&path); 228 path_put(&path);
229 return fd; 229 return fd;
230 } 230 }
231 file = file_open_root(path.dentry, path.mnt, "", open_flag); 231 file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
232 if (IS_ERR(file)) { 232 if (IS_ERR(file)) {
233 put_unused_fd(fd); 233 put_unused_fd(fd);
234 retval = PTR_ERR(file); 234 retval = PTR_ERR(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c46ed9f3e14..fee81e8768c9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
281 wb_get(wb); 281 wb_get(wb);
282 spin_unlock(&inode->i_lock); 282 spin_unlock(&inode->i_lock);
283 spin_lock(&wb->list_lock); 283 spin_lock(&wb->list_lock);
284 wb_put(wb); /* not gonna deref it anymore */
285 284
286 /* i_wb may have changed inbetween, can't use inode_to_wb() */ 285 /* i_wb may have changed inbetween, can't use inode_to_wb() */
287 if (likely(wb == inode->i_wb)) 286 if (likely(wb == inode->i_wb)) {
288 return wb; /* @inode already has ref */ 287 wb_put(wb); /* @inode already has ref */
288 return wb;
289 }
289 290
290 spin_unlock(&wb->list_lock); 291 spin_unlock(&wb->list_lock);
292 wb_put(wb);
291 cpu_relax(); 293 cpu_relax();
292 spin_lock(&inode->i_lock); 294 spin_lock(&inode->i_lock);
293 } 295 }
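
The reordering above delays the wb_put() until after the inode/wb association has been re-checked under the list lock, retrying when the inode was switched in the meantime. A userspace sketch of the take-ref/lock/re-check/retry pattern, with stand-in types and mutexes in place of the kernel primitives:

#include <pthread.h>
#include <stdio.h>

struct wb { pthread_mutex_t list_lock; int refs; };
struct inode { pthread_mutex_t i_lock; struct wb *i_wb; };

static struct wb *lock_wb_list(struct inode *inode)
{
	for (;;) {
		pthread_mutex_lock(&inode->i_lock);
		struct wb *wb = inode->i_wb;

		wb->refs++;			/* wb_get() */
		pthread_mutex_unlock(&inode->i_lock);
		pthread_mutex_lock(&wb->list_lock);
		if (wb == inode->i_wb) {	/* association unchanged? */
			wb->refs--;		/* wb_put(): inode holds a ref */
			return wb;		/* list_lock held on return */
		}
		pthread_mutex_unlock(&wb->list_lock);
		wb->refs--;			/* wb_put() before retrying */
	}
}

int main(void)
{
	struct wb w = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct inode ino = { PTHREAD_MUTEX_INITIALIZER, &w };
	struct wb *locked = lock_wb_list(&ino);

	printf("locked wb, refs=%d\n", locked->refs);
	pthread_mutex_unlock(&locked->list_lock);
	return 0;
}
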
@@ -1337,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1337 * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() 1339 * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
1338 * and does more profound writeback list handling in writeback_sb_inodes(). 1340 * and does more profound writeback list handling in writeback_sb_inodes().
1339 */ 1341 */
1340static int 1342static int writeback_single_inode(struct inode *inode,
1341writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, 1343 struct writeback_control *wbc)
1342 struct writeback_control *wbc)
1343{ 1344{
1345 struct bdi_writeback *wb;
1344 int ret = 0; 1346 int ret = 0;
1345 1347
1346 spin_lock(&inode->i_lock); 1348 spin_lock(&inode->i_lock);
@@ -1378,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
1378 ret = __writeback_single_inode(inode, wbc); 1380 ret = __writeback_single_inode(inode, wbc);
1379 1381
1380 wbc_detach_inode(wbc); 1382 wbc_detach_inode(wbc);
1381 spin_lock(&wb->list_lock); 1383
1384 wb = inode_to_wb_and_lock_list(inode);
1382 spin_lock(&inode->i_lock); 1385 spin_lock(&inode->i_lock);
1383 /* 1386 /*
1384 * If inode is clean, remove it from writeback lists. Otherwise don't 1387 * If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1453,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
1453 1456
1454 while (!list_empty(&wb->b_io)) { 1457 while (!list_empty(&wb->b_io)) {
1455 struct inode *inode = wb_inode(wb->b_io.prev); 1458 struct inode *inode = wb_inode(wb->b_io.prev);
1459 struct bdi_writeback *tmp_wb;
1456 1460
1457 if (inode->i_sb != sb) { 1461 if (inode->i_sb != sb) {
1458 if (work->sb) { 1462 if (work->sb) {
@@ -1543,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
1543 cond_resched(); 1547 cond_resched();
1544 } 1548 }
1545 1549
1546 1550 /*
1547 spin_lock(&wb->list_lock); 1551 * Requeue @inode if still dirty. Be careful as @inode may
1552 * have been switched to another wb in the meantime.
1553 */
1554 tmp_wb = inode_to_wb_and_lock_list(inode);
1548 spin_lock(&inode->i_lock); 1555 spin_lock(&inode->i_lock);
1549 if (!(inode->i_state & I_DIRTY_ALL)) 1556 if (!(inode->i_state & I_DIRTY_ALL))
1550 wrote++; 1557 wrote++;
1551 requeue_inode(inode, wb, &wbc); 1558 requeue_inode(inode, tmp_wb, &wbc);
1552 inode_sync_complete(inode); 1559 inode_sync_complete(inode);
1553 spin_unlock(&inode->i_lock); 1560 spin_unlock(&inode->i_lock);
1554 1561
1562 if (unlikely(tmp_wb != wb)) {
1563 spin_unlock(&tmp_wb->list_lock);
1564 spin_lock(&wb->list_lock);
1565 }
1566
1555 /* 1567 /*
1556 * bail out to wb_writeback() often enough to check 1568 * bail out to wb_writeback() often enough to check
1557 * background threshold and other termination conditions. 1569 * background threshold and other termination conditions.
@@ -2338,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
2338 */ 2350 */
2339int write_inode_now(struct inode *inode, int sync) 2351int write_inode_now(struct inode *inode, int sync)
2340{ 2352{
2341 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
2342 struct writeback_control wbc = { 2353 struct writeback_control wbc = {
2343 .nr_to_write = LONG_MAX, 2354 .nr_to_write = LONG_MAX,
2344 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, 2355 .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2350,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync)
2350 wbc.nr_to_write = 0; 2361 wbc.nr_to_write = 0;
2351 2362
2352 might_sleep(); 2363 might_sleep();
2353 return writeback_single_inode(inode, wb, &wbc); 2364 return writeback_single_inode(inode, &wbc);
2354} 2365}
2355EXPORT_SYMBOL(write_inode_now); 2366EXPORT_SYMBOL(write_inode_now);
2356 2367
@@ -2367,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now);
2367 */ 2378 */
2368int sync_inode(struct inode *inode, struct writeback_control *wbc) 2379int sync_inode(struct inode *inode, struct writeback_control *wbc)
2369{ 2380{
2370 return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); 2381 return writeback_single_inode(inode, wbc);
2371} 2382}
2372EXPORT_SYMBOL(sync_inode); 2383EXPORT_SYMBOL(sync_inode);
2373 2384
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8e3ee1936c7e..c5b6b7165489 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
90 90
91static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) 91static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
92{ 92{
93 struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; 93 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
94 loff_t pos = 0; 94 loff_t pos = 0;
95 95
96 return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); 96 return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
@@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
98 98
99static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) 99static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
100{ 100{
101 struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; 101 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
102 loff_t pos = 0; 102 loff_t pos = 0;
103 /* 103 /*
104 * No locking or generic_write_checks(), the server is 104 * No locking or generic_write_checks(), the server is
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b03d253ece15..9dde38f12c07 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -528,6 +528,11 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
528 } 528 }
529} 529}
530 530
531static void fuse_io_release(struct kref *kref)
532{
533 kfree(container_of(kref, struct fuse_io_priv, refcnt));
534}
535
531static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) 536static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
532{ 537{
533 if (io->err) 538 if (io->err)
@@ -585,8 +590,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
585 } 590 }
586 591
587 io->iocb->ki_complete(io->iocb, res, 0); 592 io->iocb->ki_complete(io->iocb, res, 0);
588 kfree(io);
589 } 593 }
594
595 kref_put(&io->refcnt, fuse_io_release);
590} 596}
591 597
592static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) 598static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
@@ -613,6 +619,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
613 size_t num_bytes, struct fuse_io_priv *io) 619 size_t num_bytes, struct fuse_io_priv *io)
614{ 620{
615 spin_lock(&io->lock); 621 spin_lock(&io->lock);
622 kref_get(&io->refcnt);
616 io->size += num_bytes; 623 io->size += num_bytes;
617 io->reqs++; 624 io->reqs++;
618 spin_unlock(&io->lock); 625 spin_unlock(&io->lock);
@@ -691,7 +698,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
691 698
692static int fuse_do_readpage(struct file *file, struct page *page) 699static int fuse_do_readpage(struct file *file, struct page *page)
693{ 700{
694 struct fuse_io_priv io = { .async = 0, .file = file }; 701 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
695 struct inode *inode = page->mapping->host; 702 struct inode *inode = page->mapping->host;
696 struct fuse_conn *fc = get_fuse_conn(inode); 703 struct fuse_conn *fc = get_fuse_conn(inode);
697 struct fuse_req *req; 704 struct fuse_req *req;
@@ -984,7 +991,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
984 size_t res; 991 size_t res;
985 unsigned offset; 992 unsigned offset;
986 unsigned i; 993 unsigned i;
987 struct fuse_io_priv io = { .async = 0, .file = file }; 994 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
988 995
989 for (i = 0; i < req->num_pages; i++) 996 for (i = 0; i < req->num_pages; i++)
990 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 997 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
@@ -1240,6 +1247,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1240 size_t *nbytesp, int write) 1247 size_t *nbytesp, int write)
1241{ 1248{
1242 size_t nbytes = 0; /* # bytes already packed in req */ 1249 size_t nbytes = 0; /* # bytes already packed in req */
1250 ssize_t ret = 0;
1243 1251
1244 /* Special case for kernel I/O: can copy directly into the buffer */ 1252 /* Special case for kernel I/O: can copy directly into the buffer */
1245 if (ii->type & ITER_KVEC) { 1253 if (ii->type & ITER_KVEC) {
@@ -1259,13 +1267,12 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1259 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1267 while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1260 unsigned npages; 1268 unsigned npages;
1261 size_t start; 1269 size_t start;
1262 ssize_t ret = iov_iter_get_pages(ii, 1270 ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
1263 &req->pages[req->num_pages],
1264 *nbytesp - nbytes, 1271 *nbytesp - nbytes,
1265 req->max_pages - req->num_pages, 1272 req->max_pages - req->num_pages,
1266 &start); 1273 &start);
1267 if (ret < 0) 1274 if (ret < 0)
1268 return ret; 1275 break;
1269 1276
1270 iov_iter_advance(ii, ret); 1277 iov_iter_advance(ii, ret);
1271 nbytes += ret; 1278 nbytes += ret;
@@ -1288,7 +1295,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1288 1295
1289 *nbytesp = nbytes; 1296 *nbytesp = nbytes;
1290 1297
1291 return 0; 1298 return ret;
1292} 1299}
1293 1300
1294static inline int fuse_iter_npages(const struct iov_iter *ii_p) 1301static inline int fuse_iter_npages(const struct iov_iter *ii_p)
@@ -1312,6 +1319,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1312 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; 1319 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1313 ssize_t res = 0; 1320 ssize_t res = 0;
1314 struct fuse_req *req; 1321 struct fuse_req *req;
1322 int err = 0;
1315 1323
1316 if (io->async) 1324 if (io->async)
1317 req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); 1325 req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
@@ -1332,11 +1340,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1332 size_t nres; 1340 size_t nres;
1333 fl_owner_t owner = current->files; 1341 fl_owner_t owner = current->files;
1334 size_t nbytes = min(count, nmax); 1342 size_t nbytes = min(count, nmax);
1335 int err = fuse_get_user_pages(req, iter, &nbytes, write); 1343 err = fuse_get_user_pages(req, iter, &nbytes, write);
1336 if (err) { 1344 if (err && !nbytes)
1337 res = err;
1338 break; 1345 break;
1339 }
1340 1346
1341 if (write) 1347 if (write)
1342 nres = fuse_send_write(req, io, pos, nbytes, owner); 1348 nres = fuse_send_write(req, io, pos, nbytes, owner);
@@ -1346,11 +1352,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1346 if (!io->async) 1352 if (!io->async)
1347 fuse_release_user_pages(req, !write); 1353 fuse_release_user_pages(req, !write);
1348 if (req->out.h.error) { 1354 if (req->out.h.error) {
1349 if (!res) 1355 err = req->out.h.error;
1350 res = req->out.h.error;
1351 break; 1356 break;
1352 } else if (nres > nbytes) { 1357 } else if (nres > nbytes) {
1353 res = -EIO; 1358 res = 0;
1359 err = -EIO;
1354 break; 1360 break;
1355 } 1361 }
1356 count -= nres; 1362 count -= nres;
@@ -1374,7 +1380,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1374 if (res > 0) 1380 if (res > 0)
1375 *ppos = pos; 1381 *ppos = pos;
1376 1382
1377 return res; 1383 return res > 0 ? res : err;
1378} 1384}
1379EXPORT_SYMBOL_GPL(fuse_direct_io); 1385EXPORT_SYMBOL_GPL(fuse_direct_io);
1380 1386
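
fuse_direct_io() now tracks the first error separately from the byte count and reports partial progress whenever any bytes moved. A minimal model of that return convention (the chunk-transfer helper is invented for the demo):

#include <stdio.h>

static long transfer_chunk(int i)
{
	return i < 2 ? 512 : -5;	/* third chunk fails, like -EIO */
}

static long direct_io(int nchunks)
{
	long res = 0;
	long err = 0;

	for (int i = 0; i < nchunks; i++) {
		long nres = transfer_chunk(i);

		if (nres < 0) {
			err = nres;
			break;
		}
		res += nres;
	}
	return res > 0 ? res : err;	/* partial progress wins over err */
}

int main(void)
{
	printf("2 chunks: %ld\n", direct_io(2));	/* 1024 */
	printf("4 chunks: %ld\n", direct_io(4));	/* 1024, not -5 */
	return 0;
}
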
@@ -1398,7 +1404,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1398 1404
1399static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) 1405static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1400{ 1406{
1401 struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp }; 1407 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
1402 return __fuse_direct_read(&io, to, &iocb->ki_pos); 1408 return __fuse_direct_read(&io, to, &iocb->ki_pos);
1403} 1409}
1404 1410
@@ -1406,7 +1412,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1406{ 1412{
1407 struct file *file = iocb->ki_filp; 1413 struct file *file = iocb->ki_filp;
1408 struct inode *inode = file_inode(file); 1414 struct inode *inode = file_inode(file);
1409 struct fuse_io_priv io = { .async = 0, .file = file }; 1415 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
1410 ssize_t res; 1416 ssize_t res;
1411 1417
1412 if (is_bad_inode(inode)) 1418 if (is_bad_inode(inode))
@@ -2843,6 +2849,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2843 loff_t i_size; 2849 loff_t i_size;
2844 size_t count = iov_iter_count(iter); 2850 size_t count = iov_iter_count(iter);
2845 struct fuse_io_priv *io; 2851 struct fuse_io_priv *io;
2852 bool is_sync = is_sync_kiocb(iocb);
2846 2853
2847 pos = offset; 2854 pos = offset;
2848 inode = file->f_mapping->host; 2855 inode = file->f_mapping->host;
@@ -2863,6 +2870,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2863 if (!io) 2870 if (!io)
2864 return -ENOMEM; 2871 return -ENOMEM;
2865 spin_lock_init(&io->lock); 2872 spin_lock_init(&io->lock);
2873 kref_init(&io->refcnt);
2866 io->reqs = 1; 2874 io->reqs = 1;
2867 io->bytes = -1; 2875 io->bytes = -1;
2868 io->size = 0; 2876 io->size = 0;
@@ -2882,12 +2890,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2882 * to wait on real async I/O requests, so we must submit this request 2890 * to wait on real async I/O requests, so we must submit this request
2883 * synchronously. 2891 * synchronously.
2884 */ 2892 */
2885 if (!is_sync_kiocb(iocb) && (offset + count > i_size) && 2893 if (!is_sync && (offset + count > i_size) &&
2886 iov_iter_rw(iter) == WRITE) 2894 iov_iter_rw(iter) == WRITE)
2887 io->async = false; 2895 io->async = false;
2888 2896
2889 if (io->async && is_sync_kiocb(iocb)) 2897 if (io->async && is_sync) {
2898 /*
2899 * Additional reference to keep io around after
2900 * calling fuse_aio_complete()
2901 */
2902 kref_get(&io->refcnt);
2890 io->done = &wait; 2903 io->done = &wait;
2904 }
2891 2905
2892 if (iov_iter_rw(iter) == WRITE) { 2906 if (iov_iter_rw(iter) == WRITE) {
2893 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); 2907 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
@@ -2900,14 +2914,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2900 fuse_aio_complete(io, ret < 0 ? ret : 0, -1); 2914 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
2901 2915
2902 /* we have a non-extending, async request, so return */ 2916 /* we have a non-extending, async request, so return */
2903 if (!is_sync_kiocb(iocb)) 2917 if (!is_sync)
2904 return -EIOCBQUEUED; 2918 return -EIOCBQUEUED;
2905 2919
2906 wait_for_completion(&wait); 2920 wait_for_completion(&wait);
2907 ret = fuse_get_res_by_io(io); 2921 ret = fuse_get_res_by_io(io);
2908 } 2922 }
2909 2923
2910 kfree(io); 2924 kref_put(&io->refcnt, fuse_io_release);
2911 2925
2912 if (iov_iter_rw(iter) == WRITE) { 2926 if (iov_iter_rw(iter) == WRITE) {
2913 if (ret > 0) 2927 if (ret > 0)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ce394b5fe6b4..eddbe02c4028 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/kref.h>
25 26
26/** Max number of pages that can be used in a single read request */ 27/** Max number of pages that can be used in a single read request */
27#define FUSE_MAX_PAGES_PER_REQ 32 28#define FUSE_MAX_PAGES_PER_REQ 32
@@ -243,6 +244,7 @@ struct fuse_args {
243 244
244/** The request IO state (for asynchronous processing) */ 245/** The request IO state (for asynchronous processing) */
245struct fuse_io_priv { 246struct fuse_io_priv {
247 struct kref refcnt;
246 int async; 248 int async;
247 spinlock_t lock; 249 spinlock_t lock;
248 unsigned reqs; 250 unsigned reqs;
@@ -256,6 +258,13 @@ struct fuse_io_priv {
256 struct completion *done; 258 struct completion *done;
257}; 259};
258 260
261#define FUSE_IO_PRIV_SYNC(f) \
262{ \
263 .refcnt = { ATOMIC_INIT(1) }, \
264 .async = 0, \
265 .file = f, \
266}
267
259/** 268/**
260 * Request flags 269 * Request flags
261 * 270 *
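
The fuse hunks above give struct fuse_io_priv a reference count so that a synchronous submitter and the asynchronous completion path can each keep the I/O state alive, with FUSE_IO_PRIV_SYNC seeding an on-stack instance at a count of one. A minimal userspace sketch of that kref pattern, with illustrative names (io_state is not a fuse type):

/* Userspace sketch of the kref pattern the fuse patch adopts: the
 * submitter owns one reference and takes an extra one before handing
 * the state to an async completion path, so whichever side drops the
 * last reference frees it. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_state {
	atomic_int refcnt;	/* plays the role of struct kref */
	long result;
};

static struct io_state *io_alloc(void)
{
	struct io_state *io = malloc(sizeof(*io));

	if (!io)
		return NULL;
	atomic_init(&io->refcnt, 1);	/* kref_init() */
	io->result = 0;
	return io;
}

static void io_get(struct io_state *io)
{
	atomic_fetch_add(&io->refcnt, 1);	/* kref_get() */
}

static void io_put(struct io_state *io)		/* kref_put(..., release) */
{
	if (atomic_fetch_sub(&io->refcnt, 1) == 1) {
		printf("releasing io, result=%ld\n", io->result);
		free(io);
	}
}

int main(void)
{
	struct io_state *io = io_alloc();

	io_get(io);	/* extra ref: keep io alive past completion */
	io->result = 4096;
	io_put(io);	/* completion side drops its reference */
	io_put(io);	/* submitter drops the last one; io is freed */
	return 0;
}

Whichever side drops the last reference frees the state, which is what lets fuse_direct_IO() replace its unconditional kfree(io) with kref_put().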
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 93f07465e5a6..aa016e4b8bec 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
1082 * the first place, mapping->nr_pages will always be zero. 1082 * the first place, mapping->nr_pages will always be zero.
1083 */ 1083 */
1084 if (mapping->nrpages) { 1084 if (mapping->nrpages) {
1085 loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); 1085 loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
1086 loff_t len = iov_iter_count(iter); 1086 loff_t len = iov_iter_count(iter);
1087 loff_t end = PAGE_ALIGN(offset + len) - 1; 1087 loff_t end = PAGE_ALIGN(offset + len) - 1;
1088 1088
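
The gfs2_direct_IO() hunk above is a one-character masking fix: offset & (PAGE_CACHE_SIZE - 1) extracts the offset *within* the page, whereas the invalidation range needs the page-aligned start, offset & ~(PAGE_CACHE_SIZE - 1). A tiny demonstration of the difference:

/* With a 4096-byte page, `off & (SZ - 1)` is the offset inside the
 * page, while `off & ~(SZ - 1)` rounds down to the page boundary the
 * code actually wanted. */
#include <stdio.h>

#define PAGE_SZ 4096UL

int main(void)
{
	unsigned long off = 10000;	/* arbitrary file offset */

	printf("within-page : %lu\n", off & (PAGE_SZ - 1));	/* 1808 */
	printf("aligned down: %lu\n", off & ~(PAGE_SZ - 1));	/* 8192 */
	return 0;
}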
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6a92592304fb..4a01f30e9995 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -798,7 +798,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
798 int error; 798 int error;
799 799
800 error = get_leaf_nr(dip, index, &leaf_no); 800 error = get_leaf_nr(dip, index, &leaf_no);
801 if (!error) 801 if (!IS_ERR_VALUE(error))
802 error = get_leaf(dip, leaf_no, bh_out); 802 error = get_leaf(dip, leaf_no, bh_out);
803 803
804 return error; 804 return error;
@@ -1014,7 +1014,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1014 1014
1015 index = name->hash >> (32 - dip->i_depth); 1015 index = name->hash >> (32 - dip->i_depth);
1016 error = get_leaf_nr(dip, index, &leaf_no); 1016 error = get_leaf_nr(dip, index, &leaf_no);
1017 if (error) 1017 if (IS_ERR_VALUE(error))
1018 return error; 1018 return error;
1019 1019
1020 /* Get the old leaf block */ 1020 /* Get the old leaf block */
@@ -1660,7 +1660,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
1660 brelse(bh); 1660 brelse(bh);
1661 if (fail_on_exist) 1661 if (fail_on_exist)
1662 return ERR_PTR(-EEXIST); 1662 return ERR_PTR(-EEXIST);
1663 inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); 1663 inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino);
1664 if (!IS_ERR(inode)) 1664 if (!IS_ERR(inode))
1665 GFS2_I(inode)->i_rahead = rahead; 1665 GFS2_I(inode)->i_rahead = rahead;
1666 return inode; 1666 return inode;
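
The get_leaf_nr() call sites above switch from plain zero/non-zero tests to IS_ERR_VALUE(), which treats the returned value as an error only when it falls in the kernel's reserved errno range. A userspace model of the macro (the real one in linux/err.h casts through a pointer type; applying it to a plain int, as this hunk does, relies on the negative errno sign-extending to unsigned long):

#include <stdio.h>

#define MAX_ERRNO 4095UL
#define IS_ERR_VALUE(x) \
	((unsigned long)(long)(x) >= (unsigned long)-MAX_ERRNO)

int main(void)
{
	printf("%d\n", IS_ERR_VALUE(-5));	/* 1: looks like an errno */
	printf("%d\n", IS_ERR_VALUE(0));	/* 0: success             */
	printf("%d\n", IS_ERR_VALUE(1234));	/* 0: ordinary value      */
	return 0;
}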
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 5d15e9498b48..d5bda8513457 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -137,7 +137,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
137 struct gfs2_sbd *sdp = sb->s_fs_info; 137 struct gfs2_sbd *sdp = sb->s_fs_info;
138 struct inode *inode; 138 struct inode *inode;
139 139
140 inode = gfs2_ilookup(sb, inum->no_addr, 0); 140 inode = gfs2_ilookup(sb, inum->no_addr);
141 if (inode) { 141 if (inode) {
142 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 142 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
143 iput(inode); 143 iput(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a4ff7b56f5cd..6539131c52a2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -572,17 +572,24 @@ static void delete_work_func(struct work_struct *work)
572 struct inode *inode; 572 struct inode *inode;
573 u64 no_addr = gl->gl_name.ln_number; 573 u64 no_addr = gl->gl_name.ln_number;
574 574
575 /* If someone's using this glock to create a new dinode, the block must
576 have been freed by another node, then re-used, in which case our
577 iopen callback is too late after the fact. Ignore it. */
578 if (test_bit(GLF_INODE_CREATING, &gl->gl_flags))
579 goto out;
580
575 ip = gl->gl_object; 581 ip = gl->gl_object;
576 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ 582 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
577 583
578 if (ip) 584 if (ip)
579 inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); 585 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
580 else 586 else
581 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); 587 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
582 if (inode && !IS_ERR(inode)) { 588 if (inode && !IS_ERR(inode)) {
583 d_prune_aliases(inode); 589 d_prune_aliases(inode);
584 iput(inode); 590 iput(inode);
585 } 591 }
592out:
586 gfs2_glock_put(gl); 593 gfs2_glock_put(gl);
587} 594}
588 595
@@ -1015,6 +1022,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1015 handle_callback(gl, LM_ST_UNLOCKED, 0, false); 1022 handle_callback(gl, LM_ST_UNLOCKED, 0, false);
1016 1023
1017 list_del_init(&gh->gh_list); 1024 list_del_init(&gh->gh_list);
1025 clear_bit(HIF_HOLDER, &gh->gh_iflags);
1018 if (find_first_holder(gl) == NULL) { 1026 if (find_first_holder(gl) == NULL) {
1019 if (glops->go_unlock) { 1027 if (glops->go_unlock) {
1020 GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); 1028 GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 845fb09cc606..a6a3389a07fc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -328,6 +328,7 @@ enum {
328 GLF_LRU = 13, 328 GLF_LRU = 13,
329 GLF_OBJECT = 14, /* Used only for tracing */ 329 GLF_OBJECT = 14, /* Used only for tracing */
330 GLF_BLOCKING = 15, 330 GLF_BLOCKING = 15,
331 GLF_INODE_CREATING = 16, /* Inode creation occurring */
331}; 332};
332 333
333struct gfs2_glock { 334struct gfs2_glock {
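
GLF_INODE_CREATING is an ordinary glock flag bit: gfs2_create_inode() sets it before enqueuing the iopen glock and clears it on every exit path, and delete_work_func() tests it so that iopen callbacks racing with inode creation are ignored. The handshake, sketched with C11 atomics standing in for the kernel's set_bit()/test_bit()/clear_bit():

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_CREATING (1u << 0)

static atomic_uint flags;

static void delete_work(void)
{
	/* test_bit(GLF_INODE_CREATING, ...) analogue */
	if (atomic_load(&flags) & FLAG_CREATING) {
		puts("creation in flight, ignoring stale callback");
		return;
	}
	puts("running delete work");
}

int main(void)
{
	atomic_fetch_or(&flags, FLAG_CREATING);		/* set_bit()   */
	delete_work();					/* skipped     */
	atomic_fetch_and(&flags, ~FLAG_CREATING);	/* clear_bit() */
	delete_work();					/* runs        */
	return 0;
}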
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 352f958769e1..bb30f9a72c65 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -37,61 +37,9 @@
37#include "super.h" 37#include "super.h"
38#include "glops.h" 38#include "glops.h"
39 39
40struct gfs2_skip_data { 40struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
41 u64 no_addr;
42 int skipped;
43 int non_block;
44};
45
46static int iget_test(struct inode *inode, void *opaque)
47{
48 struct gfs2_inode *ip = GFS2_I(inode);
49 struct gfs2_skip_data *data = opaque;
50
51 if (ip->i_no_addr == data->no_addr) {
52 if (data->non_block &&
53 inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
54 data->skipped = 1;
55 return 0;
56 }
57 return 1;
58 }
59 return 0;
60}
61
62static int iget_set(struct inode *inode, void *opaque)
63{
64 struct gfs2_inode *ip = GFS2_I(inode);
65 struct gfs2_skip_data *data = opaque;
66
67 if (data->skipped)
68 return -ENOENT;
69 inode->i_ino = (unsigned long)(data->no_addr);
70 ip->i_no_addr = data->no_addr;
71 return 0;
72}
73
74struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
75{ 41{
76 unsigned long hash = (unsigned long)no_addr; 42 return ilookup(sb, (unsigned long)no_addr);
77 struct gfs2_skip_data data;
78
79 data.no_addr = no_addr;
80 data.skipped = 0;
81 data.non_block = non_block;
82 return ilookup5(sb, hash, iget_test, &data);
83}
84
85static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
86 int non_block)
87{
88 struct gfs2_skip_data data;
89 unsigned long hash = (unsigned long)no_addr;
90
91 data.no_addr = no_addr;
92 data.skipped = 0;
93 data.non_block = non_block;
94 return iget5_locked(sb, hash, iget_test, iget_set, &data);
95} 43}
96 44
97/** 45/**
@@ -132,21 +80,21 @@ static void gfs2_set_iop(struct inode *inode)
132 * @sb: The super block 80 * @sb: The super block
133 * @no_addr: The inode number 81 * @no_addr: The inode number
134 * @type: The type of the inode 82 * @type: The type of the inode
135 * non_block: Can we block on inodes that are being freed?
136 * 83 *
137 * Returns: A VFS inode, or an error 84 * Returns: A VFS inode, or an error
138 */ 85 */
139 86
140struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, 87struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
141 u64 no_addr, u64 no_formal_ino, int non_block) 88 u64 no_addr, u64 no_formal_ino)
142{ 89{
143 struct inode *inode; 90 struct inode *inode;
144 struct gfs2_inode *ip; 91 struct gfs2_inode *ip;
145 struct gfs2_glock *io_gl = NULL; 92 struct gfs2_glock *io_gl = NULL;
146 int error; 93 int error;
147 94
148 inode = gfs2_iget(sb, no_addr, non_block); 95 inode = iget_locked(sb, (unsigned long)no_addr);
149 ip = GFS2_I(inode); 96 ip = GFS2_I(inode);
97 ip->i_no_addr = no_addr;
150 98
151 if (!inode) 99 if (!inode)
152 return ERR_PTR(-ENOMEM); 100 return ERR_PTR(-ENOMEM);
@@ -221,7 +169,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
221 if (error) 169 if (error)
222 goto fail; 170 goto fail;
223 171
224 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); 172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
225 if (IS_ERR(inode)) 173 if (IS_ERR(inode))
226 goto fail; 174 goto fail;
227 175
@@ -592,7 +540,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
592 struct inode *inode = NULL; 540 struct inode *inode = NULL;
593 struct gfs2_inode *dip = GFS2_I(dir), *ip; 541 struct gfs2_inode *dip = GFS2_I(dir), *ip;
594 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 542 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
595 struct gfs2_glock *io_gl; 543 struct gfs2_glock *io_gl = NULL;
596 int error, free_vfs_inode = 1; 544 int error, free_vfs_inode = 1;
597 u32 aflags = 0; 545 u32 aflags = 0;
598 unsigned blocks = 1; 546 unsigned blocks = 1;
@@ -729,6 +677,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
729 if (error) 677 if (error)
730 goto fail_gunlock2; 678 goto fail_gunlock2;
731 679
680 BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags));
681
732 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 682 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
733 if (error) 683 if (error)
734 goto fail_gunlock2; 684 goto fail_gunlock2;
@@ -771,12 +721,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
771 } 721 }
772 gfs2_glock_dq_uninit(ghs); 722 gfs2_glock_dq_uninit(ghs);
773 gfs2_glock_dq_uninit(ghs + 1); 723 gfs2_glock_dq_uninit(ghs + 1);
724 clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
774 return error; 725 return error;
775 726
776fail_gunlock3: 727fail_gunlock3:
777 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 728 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
778 gfs2_glock_put(io_gl); 729 gfs2_glock_put(io_gl);
779fail_gunlock2: 730fail_gunlock2:
731 if (io_gl)
732 clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
780 gfs2_glock_dq_uninit(ghs + 1); 733 gfs2_glock_dq_uninit(ghs + 1);
781fail_free_inode: 734fail_free_inode:
782 if (ip->i_gl) 735 if (ip->i_gl)
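
With the non_block special case gone, gfs2 can drop its private iget_test()/iget_set() pair and key the inode cache directly on no_addr through ilookup() and iget_locked(). The simplification leans on iget_locked()'s get-or-create contract: look the inode up by key, and if one had to be allocated, hand it back in a "new" state the caller must initialize. A toy userspace analogue of that contract, with a fixed array standing in for the inode hash:

#include <stdbool.h>
#include <stdio.h>

#define TABLE_SZ 8

struct toy_inode {
	unsigned long no_addr;	/* lookup key, like inode->i_ino   */
	bool in_use;
	bool is_new;		/* models the kernel's I_NEW state */
};

static struct toy_inode table[TABLE_SZ];

static struct toy_inode *toy_iget_locked(unsigned long no_addr)
{
	struct toy_inode *free_slot = NULL;

	for (int i = 0; i < TABLE_SZ; i++) {
		if (table[i].in_use && table[i].no_addr == no_addr) {
			table[i].is_new = false;	/* cache hit */
			return &table[i];
		}
		if (!table[i].in_use && !free_slot)
			free_slot = &table[i];
	}
	if (!free_slot)
		return NULL;		/* table full: the -ENOMEM case */
	free_slot->no_addr = no_addr;
	free_slot->in_use = true;
	free_slot->is_new = true;	/* caller must initialize */
	return free_slot;
}

int main(void)
{
	struct toy_inode *a = toy_iget_locked(42);

	printf("first : new=%d\n", a->is_new);	/* new=1 */
	a = toy_iget_locked(42);
	printf("second: new=%d\n", a->is_new);	/* new=0, same object */
	return 0;
}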
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index ba4d9492d422..e1af0d4aa308 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -94,12 +94,11 @@ err:
94} 94}
95 95
96extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 96extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
97 u64 no_addr, u64 no_formal_ino, 97 u64 no_addr, u64 no_formal_ino);
98 int non_block);
99extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 98extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
100 u64 *no_formal_ino, 99 u64 *no_formal_ino,
101 unsigned int blktype); 100 unsigned int blktype);
102extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); 101extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
103 102
104extern int gfs2_inode_refresh(struct gfs2_inode *ip); 103extern int gfs2_inode_refresh(struct gfs2_inode *ip);
105 104
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index dbed9e243ea2..49b0bff18fe3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -454,7 +454,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
454 struct dentry *dentry; 454 struct dentry *dentry;
455 struct inode *inode; 455 struct inode *inode;
456 456
457 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 457 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
458 if (IS_ERR(inode)) { 458 if (IS_ERR(inode)) {
459 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); 459 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
460 return PTR_ERR(inode); 460 return PTR_ERR(inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 8f960a51a9a0..f8a0cd821290 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1551,12 +1551,16 @@ static void gfs2_evict_inode(struct inode *inode)
1551 goto out_truncate; 1551 goto out_truncate;
1552 } 1552 }
1553 1553
1554 ip->i_iopen_gh.gh_flags |= GL_NOCACHE; 1554 if (ip->i_iopen_gh.gh_gl &&
1555 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1555 test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
1556 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1556 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1557 error = gfs2_glock_nq(&ip->i_iopen_gh); 1557 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1558 if (error) 1558 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE,
1559 goto out_truncate; 1559 &ip->i_iopen_gh);
1560 error = gfs2_glock_nq(&ip->i_iopen_gh);
1561 if (error)
1562 goto out_truncate;
1563 }
1560 1564
1561 /* Case 1 starts here */ 1565 /* Case 1 starts here */
1562 1566
@@ -1606,11 +1610,13 @@ out_unlock:
1606 if (gfs2_rs_active(&ip->i_res)) 1610 if (gfs2_rs_active(&ip->i_res))
1607 gfs2_rs_deltree(&ip->i_res); 1611 gfs2_rs_deltree(&ip->i_res);
1608 1612
1609 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { 1613 if (ip->i_iopen_gh.gh_gl) {
1610 ip->i_iopen_gh.gh_flags |= GL_NOCACHE; 1614 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
1611 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1615 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1616 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1617 }
1618 gfs2_holder_uninit(&ip->i_iopen_gh);
1612 } 1619 }
1613 gfs2_holder_uninit(&ip->i_iopen_gh);
1614 gfs2_glock_dq_uninit(&gh); 1620 gfs2_glock_dq_uninit(&gh);
1615 if (error && error != GLR_TRYFAILED && error != -EROFS) 1621 if (error && error != GLR_TRYFAILED && error != -EROFS)
1616 fs_warn(sdp, "gfs2_evict_inode: %d\n", error); 1622 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
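
Both gfs2_evict_inode() hunks above add the same guard: dequeue the iopen holder only if a glock is attached and HIF_HOLDER is actually set, and uninit only a holder that was initialized. The shape of the guard, condensed into userspace with illustrative booleans in place of gh_gl and gh_iflags:

#include <stdbool.h>
#include <stdio.h>

struct holder {
	bool initialized;	/* gh_gl != NULL     */
	bool held;		/* HIF_HOLDER is set */
};

static void put_holder(struct holder *gh)
{
	/* Mirror of the guarded teardown: dequeue only if actually
	 * held, uninit only if initialized, and never assume either. */
	if (!gh->initialized)
		return;
	if (gh->held) {
		puts("dequeue holder");
		gh->held = false;
	}
	puts("uninit holder");
	gh->initialized = false;
}

int main(void)
{
	struct holder gh = { .initialized = true, .held = false };

	put_holder(&gh);	/* uninit only; no bogus dequeue */
	put_holder(&gh);	/* second call is a safe no-op   */
	return 0;
}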
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 36345fefa3ff..517f2de784cf 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal,
131 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
132 return 0; 132 return 0;
133 133
134 bh = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
135 JBD2_COMMIT_BLOCK);
135 if (!bh) 136 if (!bh)
136 return 1; 137 return 1;
137 138
138 tmp = (struct commit_header *)bh->b_data; 139 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 140 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 141 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
144 142
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal,
222 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 221 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping; 222 mapping = jinode->i_vfs_inode->i_mapping;
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 223 jinode->i_flags |= JI_COMMIT_RUNNING;
226 spin_unlock(&journal->j_list_lock); 224 spin_unlock(&journal->j_list_lock);
227 /* 225 /*
228 * submit the inode data buffers. We use writepage 226 * submit the inode data buffers. We use writepage
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
236 ret = err; 234 ret = err;
237 spin_lock(&journal->j_list_lock); 235 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction); 236 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 237 jinode->i_flags &= ~JI_COMMIT_RUNNING;
240 smp_mb__after_atomic(); 238 smp_mb();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 239 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
242 } 240 }
243 spin_unlock(&journal->j_list_lock); 241 spin_unlock(&journal->j_list_lock);
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
258 /* For locking, see the comment in journal_submit_data_buffers() */ 256 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock); 257 spin_lock(&journal->j_list_lock);
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 258 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 259 jinode->i_flags |= JI_COMMIT_RUNNING;
262 spin_unlock(&journal->j_list_lock); 260 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 261 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
264 if (err) { 262 if (err) {
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
274 ret = err; 272 ret = err;
275 } 273 }
276 spin_lock(&journal->j_list_lock); 274 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 275 jinode->i_flags &= ~JI_COMMIT_RUNNING;
278 smp_mb__after_atomic(); 276 smp_mb();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 277 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
280 } 278 }
281 279
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 317 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
320} 318}
321 319
322static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
324{
325 struct jbd2_journal_block_tail *tail;
326 __u32 csum;
327
328 if (!jbd2_journal_has_csum_v2or3(j))
329 return;
330
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
336}
337
338static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 320static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence) 321 struct buffer_head *bh, __u32 sequence)
340{ 322{
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
379 ktime_t start_time; 361 ktime_t start_time;
380 u64 commit_time; 362 u64 commit_time;
381 char *tagp = NULL; 363 char *tagp = NULL;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL; 364 journal_block_tag_t *tag = NULL;
384 int space_left = 0; 365 int space_left = 0;
385 int first_tag = 0; 366 int first_tag = 0;
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
554 jbd2_journal_abort(journal, err); 535 jbd2_journal_abort(journal, err);
555 536
556 blk_start_plug(&plug); 537 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(journal, commit_transaction, 538 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
558 &log_bufs, WRITE_SYNC);
559 539
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 540 jbd_debug(3, "JBD2: commit phase 2b\n");
561 541
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 596
617 jbd_debug(4, "JBD2: get descriptor\n"); 597 jbd_debug(4, "JBD2: get descriptor\n");
618 598
619 descriptor = jbd2_journal_get_descriptor_buffer(journal); 599 descriptor = jbd2_journal_get_descriptor_buffer(
600 commit_transaction,
601 JBD2_DESCRIPTOR_BLOCK);
620 if (!descriptor) { 602 if (!descriptor) {
621 jbd2_journal_abort(journal, -EIO); 603 jbd2_journal_abort(journal, -EIO);
622 continue; 604 continue;
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 607 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
626 (unsigned long long)descriptor->b_blocknr, 608 (unsigned long long)descriptor->b_blocknr,
627 descriptor->b_data); 609 descriptor->b_data);
628 header = (journal_header_t *)descriptor->b_data;
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
632
633 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 610 tagp = &descriptor->b_data[sizeof(journal_header_t)];
634 space_left = descriptor->b_size - 611 space_left = descriptor->b_size -
635 sizeof(journal_header_t); 612 sizeof(journal_header_t);
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
721 698
722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 699 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
723 700
724 jbd2_descr_block_csum_set(journal, descriptor); 701 jbd2_descriptor_block_csum_set(journal, descriptor);
725start_journal_io: 702start_journal_io:
726 for (i = 0; i < bufs; i++) { 703 for (i = 0; i < bufs; i++) {
727 struct buffer_head *bh = wbuf[i]; 704 struct buffer_head *bh = wbuf[i];
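
The commit-path hunks above depend on jbd2_journal_get_descriptor_buffer() now stamping the common journal header itself, so the commit, descriptor, and revoke writers stop open-coding the magic/blocktype/sequence triple. A userspace sketch of that factoring, with simplified host-endian fields (on disk they are big-endian):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define JMAGIC 0xc03b3998u	/* JBD2_MAGIC_NUMBER */

struct jhdr {
	uint32_t h_magic;
	uint32_t h_blocktype;	/* descriptor / commit / revoke */
	uint32_t h_sequence;	/* committing transaction's tid */
};

static void *get_descriptor_buffer(uint32_t tid, uint32_t type, size_t bsz)
{
	void *buf = calloc(1, bsz);
	struct jhdr *h = buf;

	if (!buf)
		return NULL;
	/* One place stamps the header for every descriptor type. */
	h->h_magic = JMAGIC;
	h->h_blocktype = type;
	h->h_sequence = tid;
	return buf;
}

int main(void)
{
	struct jhdr *d = get_descriptor_buffer(7, 1 /* descriptor */, 4096);

	if (!d)
		return 1;
	printf("magic=%#x type=%u tid=%u\n",
	       d->h_magic, d->h_blocktype, d->h_sequence);
	free(d);
	return 0;
}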
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81e622681c82..de73a9516a54 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
805 * But we don't bother doing that, so there will be coherency problems with 805 * But we don't bother doing that, so there will be coherency problems with
806 * mmaps of blockdevs which hold live JBD-controlled filesystems. 806 * mmaps of blockdevs which hold live JBD-controlled filesystems.
807 */ 807 */
808struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 808struct buffer_head *
809jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
809{ 810{
811 journal_t *journal = transaction->t_journal;
810 struct buffer_head *bh; 812 struct buffer_head *bh;
811 unsigned long long blocknr; 813 unsigned long long blocknr;
814 journal_header_t *header;
812 int err; 815 int err;
813 816
814 err = jbd2_journal_next_log_block(journal, &blocknr); 817 err = jbd2_journal_next_log_block(journal, &blocknr);
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
821 return NULL; 824 return NULL;
822 lock_buffer(bh); 825 lock_buffer(bh);
823 memset(bh->b_data, 0, journal->j_blocksize); 826 memset(bh->b_data, 0, journal->j_blocksize);
827 header = (journal_header_t *)bh->b_data;
828 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
829 header->h_blocktype = cpu_to_be32(type);
830 header->h_sequence = cpu_to_be32(transaction->t_tid);
824 set_buffer_uptodate(bh); 831 set_buffer_uptodate(bh);
825 unlock_buffer(bh); 832 unlock_buffer(bh);
826 BUFFER_TRACE(bh, "return this buffer"); 833 BUFFER_TRACE(bh, "return this buffer");
827 return bh; 834 return bh;
828} 835}
829 836
837void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
838{
839 struct jbd2_journal_block_tail *tail;
840 __u32 csum;
841
842 if (!jbd2_journal_has_csum_v2or3(j))
843 return;
844
845 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
846 sizeof(struct jbd2_journal_block_tail));
847 tail->t_checksum = 0;
848 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
849 tail->t_checksum = cpu_to_be32(csum);
850}
851
830/* 852/*
831 * Return tid of the oldest transaction in the journal and block in the journal 853 * Return tid of the oldest transaction in the journal and block in the journal
832 * where the transaction starts. 854 * where the transaction starts.
@@ -1408,11 +1430,12 @@ out:
1408/** 1430/**
1409 * jbd2_mark_journal_empty() - Mark on disk journal as empty. 1431 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1410 * @journal: The journal to update. 1432 * @journal: The journal to update.
1433 * @write_op: With which operation should we write the journal sb
1411 * 1434 *
1412 * Update a journal's dynamic superblock fields to show that journal is empty. 1435 * Update a journal's dynamic superblock fields to show that journal is empty.
1413 * Write updated superblock to disk waiting for IO to complete. 1436 * Write updated superblock to disk waiting for IO to complete.
1414 */ 1437 */
1415static void jbd2_mark_journal_empty(journal_t *journal) 1438static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
1416{ 1439{
1417 journal_superblock_t *sb = journal->j_superblock; 1440 journal_superblock_t *sb = journal->j_superblock;
1418 1441
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
1430 sb->s_start = cpu_to_be32(0); 1453 sb->s_start = cpu_to_be32(0);
1431 read_unlock(&journal->j_state_lock); 1454 read_unlock(&journal->j_state_lock);
1432 1455
1433 jbd2_write_superblock(journal, WRITE_FUA); 1456 jbd2_write_superblock(journal, write_op);
1434 1457
1435 /* Log is no longer empty */ 1458 /* Log is no longer empty */
1436 write_lock(&journal->j_state_lock); 1459 write_lock(&journal->j_state_lock);
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal)
1716 if (journal->j_sb_buffer) { 1739 if (journal->j_sb_buffer) {
1717 if (!is_journal_aborted(journal)) { 1740 if (!is_journal_aborted(journal)) {
1718 mutex_lock(&journal->j_checkpoint_mutex); 1741 mutex_lock(&journal->j_checkpoint_mutex);
1719 jbd2_mark_journal_empty(journal); 1742
1743 write_lock(&journal->j_state_lock);
1744 journal->j_tail_sequence =
1745 ++journal->j_transaction_sequence;
1746 write_unlock(&journal->j_state_lock);
1747
1748 jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
1720 mutex_unlock(&journal->j_checkpoint_mutex); 1749 mutex_unlock(&journal->j_checkpoint_mutex);
1721 } else 1750 } else
1722 err = -EIO; 1751 err = -EIO;
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal)
1975 * the magic code for a fully-recovered superblock. Any future 2004 * the magic code for a fully-recovered superblock. Any future
1976 * commits of data to the journal will restore the current 2005 * commits of data to the journal will restore the current
1977 * s_start value. */ 2006 * s_start value. */
1978 jbd2_mark_journal_empty(journal); 2007 jbd2_mark_journal_empty(journal, WRITE_FUA);
1979 mutex_unlock(&journal->j_checkpoint_mutex); 2008 mutex_unlock(&journal->j_checkpoint_mutex);
1980 write_lock(&journal->j_state_lock); 2009 write_lock(&journal->j_state_lock);
1981 J_ASSERT(!journal->j_running_transaction); 2010 J_ASSERT(!journal->j_running_transaction);
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
2021 if (write) { 2050 if (write) {
2022 /* Lock to make assertions happy... */ 2051 /* Lock to make assertions happy... */
2023 mutex_lock(&journal->j_checkpoint_mutex); 2052 mutex_lock(&journal->j_checkpoint_mutex);
2024 jbd2_mark_journal_empty(journal); 2053 jbd2_mark_journal_empty(journal, WRITE_FUA);
2025 mutex_unlock(&journal->j_checkpoint_mutex); 2054 mutex_unlock(&journal->j_checkpoint_mutex);
2026 } 2055 }
2027 2056
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
2565restart: 2594restart:
2566 spin_lock(&journal->j_list_lock); 2595 spin_lock(&journal->j_list_lock);
2567 /* Is commit writing out inode - we have to wait */ 2596 /* Is commit writing out inode - we have to wait */
2568 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { 2597 if (jinode->i_flags & JI_COMMIT_RUNNING) {
2569 wait_queue_head_t *wq; 2598 wait_queue_head_t *wq;
2570 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2599 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2571 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2600 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
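
The new jbd2_descriptor_block_csum_set() above and its verify counterpart in recovery.c share one convention: the checksum sits in a tail at the end of the block and is always computed with that field zeroed. A sketch of the set/verify pair, with a toy checksum standing in for jbd2_chksum():

#include <stdint.h>
#include <stdio.h>

#define BLKSZ 64

struct block_tail { uint32_t t_checksum; };

static uint32_t toy_csum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *buf++;
	return sum;
}

static void csum_set(uint8_t *blk)
{
	struct block_tail *tail =
		(struct block_tail *)(blk + BLKSZ - sizeof(*tail));

	tail->t_checksum = 0;		/* checksum field excluded */
	tail->t_checksum = toy_csum(blk, BLKSZ);
}

static int csum_verify(uint8_t *blk)
{
	struct block_tail *tail =
		(struct block_tail *)(blk + BLKSZ - sizeof(*tail));
	uint32_t provided = tail->t_checksum, calculated;

	tail->t_checksum = 0;
	calculated = toy_csum(blk, BLKSZ);
	tail->t_checksum = provided;	/* restore for the caller */
	return provided == calculated;
}

int main(void)
{
	uint8_t blk[BLKSZ] = "descriptor payload";

	csum_set(blk);
	printf("intact : %d\n", csum_verify(blk));	/* 1 */
	blk[3] ^= 0xff;					/* corrupt a byte */
	printf("corrupt: %d\n", csum_verify(blk));	/* 0 */
	return 0;
}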
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 7f277e49fe88..08a456b96e4e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
174 return 0; 174 return 0;
175} 175}
176 176
177static int jbd2_descr_block_csum_verify(journal_t *j, 177static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
178 void *buf)
179{ 178{
180 struct jbd2_journal_block_tail *tail; 179 struct jbd2_journal_block_tail *tail;
181 __be32 provided; 180 __be32 provided;
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal,
522 descr_csum_size = 521 descr_csum_size =
523 sizeof(struct jbd2_journal_block_tail); 522 sizeof(struct jbd2_journal_block_tail);
524 if (descr_csum_size > 0 && 523 if (descr_csum_size > 0 &&
525 !jbd2_descr_block_csum_verify(journal, 524 !jbd2_descriptor_block_csum_verify(journal,
526 bh->b_data)) { 525 bh->b_data)) {
527 printk(KERN_ERR "JBD2: Invalid checksum " 526 printk(KERN_ERR "JBD2: Invalid checksum "
528 "recovering block %lu in log\n", 527 "recovering block %lu in log\n",
529 next_log_block); 528 next_log_block);
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal,
811 return err; 810 return err;
812} 811}
813 812
814static int jbd2_revoke_block_csum_verify(journal_t *j,
815 void *buf)
816{
817 struct jbd2_journal_revoke_tail *tail;
818 __be32 provided;
819 __u32 calculated;
820
821 if (!jbd2_journal_has_csum_v2or3(j))
822 return 1;
823
824 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
825 sizeof(struct jbd2_journal_revoke_tail));
826 provided = tail->r_checksum;
827 tail->r_checksum = 0;
828 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
829 tail->r_checksum = provided;
830
831 return provided == cpu_to_be32(calculated);
832}
833
834/* Scan a revoke record, marking all blocks mentioned as revoked. */ 813/* Scan a revoke record, marking all blocks mentioned as revoked. */
835 814
836static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 815static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
846 offset = sizeof(jbd2_journal_revoke_header_t); 825 offset = sizeof(jbd2_journal_revoke_header_t);
847 rcount = be32_to_cpu(header->r_count); 826 rcount = be32_to_cpu(header->r_count);
848 827
849 if (!jbd2_revoke_block_csum_verify(journal, header)) 828 if (!jbd2_descriptor_block_csum_verify(journal, header))
850 return -EFSBADCRC; 829 return -EFSBADCRC;
851 830
852 if (jbd2_journal_has_csum_v2or3(journal)) 831 if (jbd2_journal_has_csum_v2or3(journal))
853 csum_size = sizeof(struct jbd2_journal_revoke_tail); 832 csum_size = sizeof(struct jbd2_journal_block_tail);
854 if (rcount > journal->j_blocksize - csum_size) 833 if (rcount > journal->j_blocksize - csum_size)
855 return -EINVAL; 834 return -EINVAL;
856 max = rcount; 835 max = rcount;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 705ae577882b..91171dc352cb 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s
122 122
123 123
124#ifdef __KERNEL__ 124#ifdef __KERNEL__
125static void write_one_revoke_record(journal_t *, transaction_t *, 125static void write_one_revoke_record(transaction_t *,
126 struct list_head *, 126 struct list_head *,
127 struct buffer_head **, int *, 127 struct buffer_head **, int *,
128 struct jbd2_revoke_record_s *, int); 128 struct jbd2_revoke_record_s *);
129static void flush_descriptor(journal_t *, struct buffer_head *, int, int); 129static void flush_descriptor(journal_t *, struct buffer_head *, int);
130#endif 130#endif
131 131
132/* Utility functions to maintain the revoke table */ 132/* Utility functions to maintain the revoke table */
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
519 * Write revoke records to the journal for all entries in the current 519 * Write revoke records to the journal for all entries in the current
520 * revoke hash, deleting the entries as we go. 520 * revoke hash, deleting the entries as we go.
521 */ 521 */
522void jbd2_journal_write_revoke_records(journal_t *journal, 522void jbd2_journal_write_revoke_records(transaction_t *transaction,
523 transaction_t *transaction, 523 struct list_head *log_bufs)
524 struct list_head *log_bufs,
525 int write_op)
526{ 524{
525 journal_t *journal = transaction->t_journal;
527 struct buffer_head *descriptor; 526 struct buffer_head *descriptor;
528 struct jbd2_revoke_record_s *record; 527 struct jbd2_revoke_record_s *record;
529 struct jbd2_revoke_table_s *revoke; 528 struct jbd2_revoke_table_s *revoke;
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
544 while (!list_empty(hash_list)) { 543 while (!list_empty(hash_list)) {
545 record = (struct jbd2_revoke_record_s *) 544 record = (struct jbd2_revoke_record_s *)
546 hash_list->next; 545 hash_list->next;
547 write_one_revoke_record(journal, transaction, log_bufs, 546 write_one_revoke_record(transaction, log_bufs,
548 &descriptor, &offset, 547 &descriptor, &offset, record);
549 record, write_op);
550 count++; 548 count++;
551 list_del(&record->hash); 549 list_del(&record->hash);
552 kmem_cache_free(jbd2_revoke_record_cache, record); 550 kmem_cache_free(jbd2_revoke_record_cache, record);
553 } 551 }
554 } 552 }
555 if (descriptor) 553 if (descriptor)
556 flush_descriptor(journal, descriptor, offset, write_op); 554 flush_descriptor(journal, descriptor, offset);
557 jbd_debug(1, "Wrote %d revoke records\n", count); 555 jbd_debug(1, "Wrote %d revoke records\n", count);
558} 556}
559 557
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
562 * block if the old one is full or if we have not already created one. 560 * block if the old one is full or if we have not already created one.
563 */ 561 */
564 562
565static void write_one_revoke_record(journal_t *journal, 563static void write_one_revoke_record(transaction_t *transaction,
566 transaction_t *transaction,
567 struct list_head *log_bufs, 564 struct list_head *log_bufs,
568 struct buffer_head **descriptorp, 565 struct buffer_head **descriptorp,
569 int *offsetp, 566 int *offsetp,
570 struct jbd2_revoke_record_s *record, 567 struct jbd2_revoke_record_s *record)
571 int write_op)
572{ 568{
569 journal_t *journal = transaction->t_journal;
573 int csum_size = 0; 570 int csum_size = 0;
574 struct buffer_head *descriptor; 571 struct buffer_head *descriptor;
575 int sz, offset; 572 int sz, offset;
576 journal_header_t *header;
577 573
578 /* If we are already aborting, this all becomes a noop. We 574 /* If we are already aborting, this all becomes a noop. We
579 still need to go round the loop in 575 still need to go round the loop in
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal,
587 583
588 /* Do we need to leave space at the end for a checksum? */ 584 /* Do we need to leave space at the end for a checksum? */
589 if (jbd2_journal_has_csum_v2or3(journal)) 585 if (jbd2_journal_has_csum_v2or3(journal))
590 csum_size = sizeof(struct jbd2_journal_revoke_tail); 586 csum_size = sizeof(struct jbd2_journal_block_tail);
591 587
592 if (jbd2_has_feature_64bit(journal)) 588 if (jbd2_has_feature_64bit(journal))
593 sz = 8; 589 sz = 8;
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal,
597 /* Make sure we have a descriptor with space left for the record */ 593 /* Make sure we have a descriptor with space left for the record */
598 if (descriptor) { 594 if (descriptor) {
599 if (offset + sz > journal->j_blocksize - csum_size) { 595 if (offset + sz > journal->j_blocksize - csum_size) {
600 flush_descriptor(journal, descriptor, offset, write_op); 596 flush_descriptor(journal, descriptor, offset);
601 descriptor = NULL; 597 descriptor = NULL;
602 } 598 }
603 } 599 }
604 600
605 if (!descriptor) { 601 if (!descriptor) {
606 descriptor = jbd2_journal_get_descriptor_buffer(journal); 602 descriptor = jbd2_journal_get_descriptor_buffer(transaction,
603 JBD2_REVOKE_BLOCK);
607 if (!descriptor) 604 if (!descriptor)
608 return; 605 return;
609 header = (journal_header_t *)descriptor->b_data;
610 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
611 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
612 header->h_sequence = cpu_to_be32(transaction->t_tid);
613 606
614 /* Record it so that we can wait for IO completion later */ 607 /* Record it so that we can wait for IO completion later */
615 BUFFER_TRACE(descriptor, "file in log_bufs"); 608 BUFFER_TRACE(descriptor, "file in log_bufs");
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal,
630 *offsetp = offset; 623 *offsetp = offset;
631} 624}
632 625
633static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
634{
635 struct jbd2_journal_revoke_tail *tail;
636 __u32 csum;
637
638 if (!jbd2_journal_has_csum_v2or3(j))
639 return;
640
641 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
642 sizeof(struct jbd2_journal_revoke_tail));
643 tail->r_checksum = 0;
644 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
645 tail->r_checksum = cpu_to_be32(csum);
646}
647
648/* 626/*
649 * Flush a revoke descriptor out to the journal. If we are aborting, 627 * Flush a revoke descriptor out to the journal. If we are aborting,
650 * this is a noop; otherwise we are generating a buffer which needs to 628 * this is a noop; otherwise we are generating a buffer which needs to
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
654 632
655static void flush_descriptor(journal_t *journal, 633static void flush_descriptor(journal_t *journal,
656 struct buffer_head *descriptor, 634 struct buffer_head *descriptor,
657 int offset, int write_op) 635 int offset)
658{ 636{
659 jbd2_journal_revoke_header_t *header; 637 jbd2_journal_revoke_header_t *header;
660 638
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal,
665 643
666 header = (jbd2_journal_revoke_header_t *)descriptor->b_data; 644 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
667 header->r_count = cpu_to_be32(offset); 645 header->r_count = cpu_to_be32(offset);
668 jbd2_revoke_csum_set(journal, descriptor); 646 jbd2_descriptor_block_csum_set(journal, descriptor);
669 647
670 set_buffer_jwrite(descriptor); 648 set_buffer_jwrite(descriptor);
671 BUFFER_TRACE(descriptor, "write"); 649 BUFFER_TRACE(descriptor, "write");
672 set_buffer_dirty(descriptor); 650 set_buffer_dirty(descriptor);
673 write_dirty_buffer(descriptor, write_op); 651 write_dirty_buffer(descriptor, WRITE_SYNC);
674} 652}
675#endif 653#endif
676 654
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 081dff087fc0..01e4652d88f6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -966,14 +966,8 @@ repeat:
966 if (!frozen_buffer) { 966 if (!frozen_buffer) {
967 JBUFFER_TRACE(jh, "allocate memory for buffer"); 967 JBUFFER_TRACE(jh, "allocate memory for buffer");
968 jbd_unlock_bh_state(bh); 968 jbd_unlock_bh_state(bh);
969 frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 969 frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
970 if (!frozen_buffer) { 970 GFP_NOFS | __GFP_NOFAIL);
971 printk(KERN_ERR "%s: OOM for frozen_buffer\n",
972 __func__);
973 JBUFFER_TRACE(jh, "oom!");
974 error = -ENOMEM;
975 goto out;
976 }
977 goto repeat; 971 goto repeat;
978 } 972 }
979 jh->b_frozen_data = frozen_buffer; 973 jh->b_frozen_data = frozen_buffer;
@@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
1226 goto out; 1220 goto out;
1227 1221
1228repeat: 1222repeat:
1229 if (!jh->b_committed_data) { 1223 if (!jh->b_committed_data)
1230 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 1224 committed_data = jbd2_alloc(jh2bh(jh)->b_size,
1231 if (!committed_data) { 1225 GFP_NOFS|__GFP_NOFAIL);
1232 printk(KERN_ERR "%s: No memory for committed data\n",
1233 __func__);
1234 err = -ENOMEM;
1235 goto out;
1236 }
1237 }
1238 1226
1239 jbd_lock_bh_state(bh); 1227 jbd_lock_bh_state(bh);
1240 if (!jh->b_committed_data) { 1228 if (!jh->b_committed_data) {
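
The transaction.c hunks delete the OOM error paths outright: once jbd2 has committed to freezing or copying a buffer there is no reasonable way to back out, so the allocations become GFP_NOFS | __GFP_NOFAIL and the page allocator retries internally rather than returning NULL. A userspace analogue of that contract:

/* When a caller has no sane error path, keep retrying the allocation
 * rather than propagating NULL. In the kernel the page allocator does
 * the looping (and kicks reclaim); this wrapper just models the
 * __GFP_NOFAIL contract. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void *alloc_nofail(size_t size)
{
	void *p;

	while (!(p = malloc(size))) {
		fprintf(stderr, "allocation of %zu bytes failed, retrying\n",
			size);
		usleep(1000);
	}
	return p;
}

int main(void)
{
	char *frozen = alloc_nofail(4096);	/* never returns NULL */

	frozen[0] = 0;
	free(frozen);
	return 0;
}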
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 95d5880a63ee..7e553f286775 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -134,37 +134,59 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
134 if (mutex_lock_interruptible(&c->alloc_sem)) 134 if (mutex_lock_interruptible(&c->alloc_sem))
135 return -EINTR; 135 return -EINTR;
136 136
137
137 for (;;) { 138 for (;;) {
139 /* We can't start doing GC until we've finished checking
140 the node CRCs etc. */
141 int bucket, want_ino;
142
138 spin_lock(&c->erase_completion_lock); 143 spin_lock(&c->erase_completion_lock);
139 if (!c->unchecked_size) 144 if (!c->unchecked_size)
140 break; 145 break;
141
142 /* We can't start doing GC yet. We haven't finished checking
143 the node CRCs etc. Do it now. */
144
145 /* checked_ino is protected by the alloc_sem */
146 if (c->checked_ino > c->highest_ino && xattr) {
147 pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
148 c->unchecked_size);
149 jffs2_dbg_dump_block_lists_nolock(c);
150 spin_unlock(&c->erase_completion_lock);
151 mutex_unlock(&c->alloc_sem);
152 return -ENOSPC;
153 }
154
155 spin_unlock(&c->erase_completion_lock); 146 spin_unlock(&c->erase_completion_lock);
156 147
157 if (!xattr) 148 if (!xattr)
158 xattr = jffs2_verify_xattr(c); 149 xattr = jffs2_verify_xattr(c);
159 150
160 spin_lock(&c->inocache_lock); 151 spin_lock(&c->inocache_lock);
152 /* Instead of doing the inodes in numeric order, doing a lookup
153 * in the hash for each possible number, just walk the hash
154 * buckets of *existing* inodes. This means that we process
155 * them out-of-order, but it can be a lot faster if there's
156 * a sparse inode# space. Which there often is. */
157 want_ino = c->check_ino;
158 for (bucket = c->check_ino % c->inocache_hashsize ; bucket < c->inocache_hashsize; bucket++) {
159 for (ic = c->inocache_list[bucket]; ic; ic = ic->next) {
160 if (ic->ino < want_ino)
161 continue;
162
163 if (ic->state != INO_STATE_CHECKEDABSENT &&
164 ic->state != INO_STATE_PRESENT)
165 goto got_next; /* with inocache_lock held */
166
167 jffs2_dbg(1, "Skipping ino #%u already checked\n",
168 ic->ino);
169 }
170 want_ino = 0;
171 }
161 172
162 ic = jffs2_get_ino_cache(c, c->checked_ino++); 173 /* Point c->check_ino past the end of the last bucket. */
174 c->check_ino = ((c->highest_ino + c->inocache_hashsize + 1) &
175 ~c->inocache_hashsize) - 1;
163 176
164 if (!ic) { 177 spin_unlock(&c->inocache_lock);
165 spin_unlock(&c->inocache_lock); 178
166 continue; 179 pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
167 } 180 c->unchecked_size);
181 jffs2_dbg_dump_block_lists_nolock(c);
182 mutex_unlock(&c->alloc_sem);
183 return -ENOSPC;
184
185 got_next:
186 /* For next time round the loop, we want c->checked_ino to indicate
187 * the *next* one we want to check. And since we're walking the
188 * buckets rather than doing it sequentially, it's: */
189 c->check_ino = ic->ino + c->inocache_hashsize;
168 190
169 if (!ic->pino_nlink) { 191 if (!ic->pino_nlink) {
170 jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n", 192 jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
@@ -176,8 +198,6 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
176 switch(ic->state) { 198 switch(ic->state) {
177 case INO_STATE_CHECKEDABSENT: 199 case INO_STATE_CHECKEDABSENT:
178 case INO_STATE_PRESENT: 200 case INO_STATE_PRESENT:
179 jffs2_dbg(1, "Skipping ino #%u already checked\n",
180 ic->ino);
181 spin_unlock(&c->inocache_lock); 201 spin_unlock(&c->inocache_lock);
182 continue; 202 continue;
183 203
@@ -196,7 +216,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
196 ic->ino); 216 ic->ino);
197 /* We need to come back again for the _same_ inode. We've 217 /* We need to come back again for the _same_ inode. We've
198 made no progress in this case, but that should be OK */ 218 made no progress in this case, but that should be OK */
199 c->checked_ino--; 219 c->check_ino = ic->ino;
200 220
201 mutex_unlock(&c->alloc_sem); 221 mutex_unlock(&c->alloc_sem);
202 sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); 222 sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
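
The garbage-collect pass above stops probing every inode number from check_ino upward and instead walks the inocache hash buckets, visiting only inodes that exist; because each bucket is kept sorted and holds numbers congruent modulo the hash size, check_ino = ic->ino + hashsize records exactly where to resume. A small sketch of the walk (bucket layout and names are illustrative):

#include <stdio.h>

#define HASHSZ 4

struct ic { unsigned ino; int checked; struct ic *next; };

/* Buckets are kept sorted by ino, as jffs2_add_ino_cache() does. */
static struct ic n9 = { 9, 0, NULL };
static struct ic n5 = { 5, 0, &n9 };		/* bucket 1: 5 -> 9 */
static struct ic n2 = { 2, 0, NULL };		/* bucket 2: 2      */
static struct ic *buckets[HASHSZ] = { NULL, &n5, &n2, NULL };

static struct ic *next_unchecked(unsigned *check_ino)
{
	unsigned want = *check_ino;

	for (unsigned b = want % HASHSZ; b < HASHSZ; b++) {
		for (struct ic *ic = buckets[b]; ic; ic = ic->next) {
			if (ic->ino < want || ic->checked)
				continue;
			/* All inos in a bucket are congruent mod HASHSZ,
			 * so ino + HASHSZ names this slot's successor. */
			*check_ino = ic->ino + HASHSZ;
			return ic;
		}
		want = 0;	/* past the resume bucket: take anything */
	}
	return NULL;		/* every existing inode has been checked */
}

int main(void)
{
	unsigned check_ino = 0;
	struct ic *ic;

	while ((ic = next_unchecked(&check_ino))) {
		printf("checking ino #%u\n", ic->ino);	/* 5, 9, 2 */
		ic->checked = 1;
	}
	return 0;
}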
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 046fee8b6e9b..778275f48a87 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -49,7 +49,7 @@ struct jffs2_sb_info {
49 struct mtd_info *mtd; 49 struct mtd_info *mtd;
50 50
51 uint32_t highest_ino; 51 uint32_t highest_ino;
52 uint32_t checked_ino; 52 uint32_t check_ino; /* *NEXT* inode to be checked */
53 53
54 unsigned int flags; 54 unsigned int flags;
55 55
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index b6bd4affd9ad..cda0774c2c9c 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -846,8 +846,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
846 return 1; 846 return 1;
847 847
848 if (c->unchecked_size) { 848 if (c->unchecked_size) {
849 jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 849 jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, check_ino #%d\n",
850 c->unchecked_size, c->checked_ino); 850 c->unchecked_size, c->check_ino);
851 return 1; 851 return 1;
852 } 852 }
853 853
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5a3da3f52908..b25d28a21212 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1183,22 +1183,20 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
1183 1183
1184int jffs2_nand_flash_setup(struct jffs2_sb_info *c) 1184int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
1185{ 1185{
1186 struct nand_ecclayout *oinfo = c->mtd->ecclayout;
1187
1188 if (!c->mtd->oobsize) 1186 if (!c->mtd->oobsize)
1189 return 0; 1187 return 0;
1190 1188
1191 /* Cleanmarker is out-of-band, so inline size zero */ 1189 /* Cleanmarker is out-of-band, so inline size zero */
1192 c->cleanmarker_size = 0; 1190 c->cleanmarker_size = 0;
1193 1191
1194 if (!oinfo || oinfo->oobavail == 0) { 1192 if (c->mtd->oobavail == 0) {
1195 pr_err("inconsistent device description\n"); 1193 pr_err("inconsistent device description\n");
1196 return -EINVAL; 1194 return -EINVAL;
1197 } 1195 }
1198 1196
1199 jffs2_dbg(1, "using OOB on NAND\n"); 1197 jffs2_dbg(1, "using OOB on NAND\n");
1200 1198
1201 c->oobavail = oinfo->oobavail; 1199 c->oobavail = c->mtd->oobavail;
1202 1200
1203 /* Initialise write buffer */ 1201 /* Initialise write buffer */
1204 init_rwsem(&c->wbuf_sem); 1202 init_rwsem(&c->wbuf_sem);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 996b7742c90b..03b688d19f69 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,28 +44,122 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen); 44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
45} 45}
46 46
47static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf, 47/* kernfs_node_depth - compute depth from @from to @to */
48 size_t buflen) 48static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
49{ 49{
50 char *p = buf + buflen; 50 size_t depth = 0;
51 int len;
52 51
53 *--p = '\0'; 52 while (to->parent && to != from) {
53 depth++;
54 to = to->parent;
55 }
56 return depth;
57}
54 58
55 do { 59static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
56 len = strlen(kn->name); 60 struct kernfs_node *b)
57 if (p - buf < len + 1) { 61{
58 buf[0] = '\0'; 62 size_t da, db;
59 p = NULL; 63 struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
60 break; 64
61 } 65 if (ra != rb)
62 p -= len; 66 return NULL;
63 memcpy(p, kn->name, len); 67
64 *--p = '/'; 68 da = kernfs_depth(ra->kn, a);
65 kn = kn->parent; 69 db = kernfs_depth(rb->kn, b);
66 } while (kn && kn->parent); 70
71 while (da > db) {
72 a = a->parent;
73 da--;
74 }
75 while (db > da) {
76 b = b->parent;
77 db--;
78 }
79
80 /* worst case b and a will be the same at root */
81 while (b != a) {
82 b = b->parent;
83 a = a->parent;
84 }
85
86 return a;
87}
88
89/**
90 * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
91 * where kn_from is treated as root of the path.
92 * @kn_from: kernfs node which should be treated as root for the path
93 * @kn_to: kernfs node to which path is needed
94 * @buf: buffer to copy the path into
95 * @buflen: size of @buf
96 *
97 * We need to handle couple of scenarios here:
98 * [1] when @kn_from is an ancestor of @kn_to at some level
99 * kn_from: /n1/n2/n3
100 * kn_to: /n1/n2/n3/n4/n5
101 * result: /n4/n5
102 *
103 * [2] when @kn_from is on a different hierarchy and we need to find common
104 * ancestor between @kn_from and @kn_to.
105 * kn_from: /n1/n2/n3/n4
106 * kn_to: /n1/n2/n5
107 * result: /../../n5
108 * OR
109 * kn_from: /n1/n2/n3/n4/n5 [depth=5]
110 * kn_to: /n1/n2/n3 [depth=3]
111 * result: /../..
112 *
113 * return value: length of the string. If greater than buflen,
114 * then contents of buf are undefined. On error, -1 is returned.
115 */
116static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
117 struct kernfs_node *kn_from,
118 char *buf, size_t buflen)
119{
120 struct kernfs_node *kn, *common;
121 const char parent_str[] = "/..";
122 size_t depth_from, depth_to, len = 0, nlen = 0;
123 char *p;
124 int i;
125
126 if (!kn_from)
127 kn_from = kernfs_root(kn_to)->kn;
128
129 if (kn_from == kn_to)
130 return strlcpy(buf, "/", buflen);
131
132 common = kernfs_common_ancestor(kn_from, kn_to);
133 if (WARN_ON(!common))
134 return -1;
135
136 depth_to = kernfs_depth(common, kn_to);
137 depth_from = kernfs_depth(common, kn_from);
138
139 if (buf)
140 buf[0] = '\0';
141
142 for (i = 0; i < depth_from; i++)
143 len += strlcpy(buf + len, parent_str,
144 len < buflen ? buflen - len : 0);
145
146 /* Calculate how many bytes we need for the rest */
147 for (kn = kn_to; kn != common; kn = kn->parent)
148 nlen += strlen(kn->name) + 1;
149
150 if (len + nlen >= buflen)
151 return len + nlen;
152
153 p = buf + len + nlen;
154 *p = '\0';
155 for (kn = kn_to; kn != common; kn = kn->parent) {
156 nlen = strlen(kn->name);
157 p -= nlen;
158 memcpy(p, kn->name, nlen);
159 *(--p) = '/';
160 }
67 161
68 return p; 162 return len + nlen;
69} 163}
70 164
71/** 165/**
@@ -115,6 +209,34 @@ size_t kernfs_path_len(struct kernfs_node *kn)
115} 209}
116 210
117/** 211/**
212 * kernfs_path_from_node - build path of node @to relative to @from.
213 * @from: parent kernfs_node relative to which we need to build the path
214 * @to: kernfs_node of interest
215 * @buf: buffer to copy @to's path into
216 * @buflen: size of @buf
217 *
218 * Builds @to's path relative to @from in @buf. @from and @to must
219 * be on the same kernfs-root. If @from is not parent of @to, then a relative
220 * path (which includes '..'s) as needed to reach from @from to @to is
221 * returned.
222 *
223 * If @buf isn't long enough, the return value will be greater than @buflen
224 * and @buf contents are undefined.
225 */
226int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
227 char *buf, size_t buflen)
228{
229 unsigned long flags;
230 int ret;
231
232 spin_lock_irqsave(&kernfs_rename_lock, flags);
233 ret = kernfs_path_from_node_locked(to, from, buf, buflen);
234 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
235 return ret;
236}
237EXPORT_SYMBOL_GPL(kernfs_path_from_node);
238
239/**
118 * kernfs_path - build full path of a given node 240 * kernfs_path - build full path of a given node
119 * @kn: kernfs_node of interest 241 * @kn: kernfs_node of interest
120 * @buf: buffer to copy @kn's name into 242 * @buf: buffer to copy @kn's name into
@@ -127,13 +249,12 @@ size_t kernfs_path_len(struct kernfs_node *kn)
127 */ 249 */
128char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) 250char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
129{ 251{
130 unsigned long flags; 252 int ret;
131 char *p;
132 253
133 spin_lock_irqsave(&kernfs_rename_lock, flags); 254 ret = kernfs_path_from_node(kn, NULL, buf, buflen);
134 p = kernfs_path_locked(kn, buf, buflen); 255 if (ret < 0 || ret >= buflen)
135 spin_unlock_irqrestore(&kernfs_rename_lock, flags); 256 return NULL;
136 return p; 257 return buf;
137} 258}
138EXPORT_SYMBOL_GPL(kernfs_path); 259EXPORT_SYMBOL_GPL(kernfs_path);
139 260
@@ -164,17 +285,25 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
164void pr_cont_kernfs_path(struct kernfs_node *kn) 285void pr_cont_kernfs_path(struct kernfs_node *kn)
165{ 286{
166 unsigned long flags; 287 unsigned long flags;
167 char *p; 288 int sz;
168 289
169 spin_lock_irqsave(&kernfs_rename_lock, flags); 290 spin_lock_irqsave(&kernfs_rename_lock, flags);
170 291
171 p = kernfs_path_locked(kn, kernfs_pr_cont_buf, 292 sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
172 sizeof(kernfs_pr_cont_buf)); 293 sizeof(kernfs_pr_cont_buf));
173 if (p) 294 if (sz < 0) {
174 pr_cont("%s", p); 295 pr_cont("(error)");
175 else 296 goto out;
176 pr_cont("<name too long>"); 297 }
298
299 if (sz >= sizeof(kernfs_pr_cont_buf)) {
300 pr_cont("(name too long)");
301 goto out;
302 }
177 303
304 pr_cont("%s", kernfs_pr_cont_buf);
305
306out:
178 spin_unlock_irqrestore(&kernfs_rename_lock, flags); 307 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
179} 308}
180 309
@@ -691,15 +820,22 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
691 const unsigned char *path, 820 const unsigned char *path,
692 const void *ns) 821 const void *ns)
693{ 822{
694 static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ 823 size_t len;
695 size_t len = strlcpy(path_buf, path, PATH_MAX); 824 char *p, *name;
696 char *p = path_buf;
697 char *name;
698 825
699 lockdep_assert_held(&kernfs_mutex); 826 lockdep_assert_held(&kernfs_mutex);
700 827
701 if (len >= PATH_MAX) 828 /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
829 spin_lock_irq(&kernfs_rename_lock);
830
831 len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
832
833 if (len >= sizeof(kernfs_pr_cont_buf)) {
834 spin_unlock_irq(&kernfs_rename_lock);
702 return NULL; 835 return NULL;
836 }
837
838 p = kernfs_pr_cont_buf;
703 839
704 while ((name = strsep(&p, "/")) && parent) { 840 while ((name = strsep(&p, "/")) && parent) {
705 if (*name == '\0') 841 if (*name == '\0')
@@ -707,6 +843,8 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
707 parent = kernfs_find_ns(parent, name, ns); 843 parent = kernfs_find_ns(parent, name, ns);
708 } 844 }
709 845
846 spin_unlock_irq(&kernfs_rename_lock);
847
710 return parent; 848 return parent;
711} 849}
712 850
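
kernfs_path_from_node_locked() above finds the common ancestor by equalizing the two depths, emits one "/.." per level remaining on the @from side, then copies the names from @to downward. The same algorithm on a toy parent-pointer tree in userspace (the tail is built recursively here for clarity, where the kernel copies names back-to-front):

#include <stdio.h>
#include <string.h>

struct node { const char *name; struct node *parent; };

static size_t depth(struct node *n)
{
	size_t d = 0;

	for (; n->parent; n = n->parent)
		d++;
	return d;
}

static void append_names(struct node *n, struct node *stop, char *buf)
{
	if (n == stop)
		return;
	append_names(n->parent, stop, buf);
	strcat(buf, "/");
	strcat(buf, n->name);
}

static void path_from_node(struct node *to, struct node *from, char *buf)
{
	size_t dt = depth(to), df = depth(from);
	struct node *a = to, *b = from;

	buf[0] = '\0';
	while (dt > df) { a = a->parent; dt--; }
	while (df > dt) { strcat(buf, "/.."); b = b->parent; df--; }
	while (a != b) {	/* climb in lockstep to the ancestor */
		strcat(buf, "/..");
		a = a->parent;
		b = b->parent;
	}
	append_names(to, a, buf);
	if (!buf[0])
		strcpy(buf, "/");	/* from == to */
}

int main(void)
{
	struct node root = { "", NULL };
	struct node n1 = { "n1", &root }, n2 = { "n2", &n1 };
	struct node n3 = { "n3", &n2 }, n5 = { "n5", &n2 };
	char buf[128];

	path_from_node(&n5, &n3, buf);
	printf("%s\n", buf);	/* /../n5 */
	return 0;
}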
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 8eaf417187f1..b67dbccdaf88 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -14,6 +14,7 @@
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/namei.h>
17 18
18#include "kernfs-internal.h" 19#include "kernfs-internal.h"
19 20
@@ -62,6 +63,74 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
62 return NULL; 63 return NULL;
63} 64}
64 65
66/*
67 * find the next ancestor in the path down to @child, where @parent was the
68 * ancestor whose descendant we want to find.
69 *
70 * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
71 * node. If @parent is b, then we return the node for c.
72 * Passing in d as @parent is not ok.
73 */
74static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
75 struct kernfs_node *parent)
76{
77 if (child == parent) {
78 pr_crit_once("BUG in find_next_ancestor: called with parent == child");
79 return NULL;
80 }
81
82 while (child->parent != parent) {
83 if (!child->parent)
84 return NULL;
85 child = child->parent;
86 }
87
88 return child;
89}
90
91/**
92 * kernfs_node_dentry - get a dentry for the given kernfs_node
93 * @kn: kernfs_node for which a dentry is needed
94 * @sb: the kernfs super_block
95 */
96struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
97 struct super_block *sb)
98{
99 struct dentry *dentry;
100 struct kernfs_node *knparent = NULL;
101
102 BUG_ON(sb->s_op != &kernfs_sops);
103
104 dentry = dget(sb->s_root);
105
106 /* Check if this is the root kernfs_node */
107 if (!kn->parent)
108 return dentry;
109
110 knparent = find_next_ancestor(kn, NULL);
111 if (WARN_ON(!knparent))
112 return ERR_PTR(-EINVAL);
113
114 do {
115 struct dentry *dtmp;
116 struct kernfs_node *kntmp;
117
118 if (kn == knparent)
119 return dentry;
120 kntmp = find_next_ancestor(kn, knparent);
121 if (WARN_ON(!kntmp))
122 return ERR_PTR(-EINVAL);
123 mutex_lock(&d_inode(dentry)->i_mutex);
124 dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
125 mutex_unlock(&d_inode(dentry)->i_mutex);
126 dput(dentry);
127 if (IS_ERR(dtmp))
128 return dtmp;
129 knparent = kntmp;
130 dentry = dtmp;
131 } while (true);
132}
133
65static int kernfs_fill_super(struct super_block *sb, unsigned long magic) 134static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
66{ 135{
67 struct kernfs_super_info *info = kernfs_info(sb); 136 struct kernfs_super_info *info = kernfs_info(sb);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 187477ded6b3..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
1/* 1#include <linux/spinlock.h>
2 * linux/fs/mbcache.c
3 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
4 */
5
6/*
7 * Filesystem Meta Information Block Cache (mbcache)
8 *
9 * The mbcache caches blocks of block devices that need to be located
10 * by their device/block number, as well as by other criteria (such
11 * as the block's contents).
12 *
13 * There can only be one cache entry in a cache per device and block number.
14 * Additional indexes need not be unique in this sense. The number of
15 * additional indexes (=other criteria) can be hardwired at compile time
16 * or specified at cache create time.
17 *
18 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
19 * in the cache. A valid entry is in the main hash tables of the cache,
20 * and may also be in the lru list. An invalid entry is not in any hashes
21 * or lists.
22 *
23 * A valid cache entry is only in the lru list if no handles refer to it.
24 * Invalid cache entries will be freed when the last handle to the cache
25 * entry is released. Entries that cannot be freed immediately are put
26 * back on the lru list.
27 */
28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
45 * lock), and mb_cach_spinlock, with the lowest order. While holding
46 * either a block or index hash chain lock, a thread can acquire an
47 * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
52 * index hash chian, it needs to lock the corresponding hash chain. For each
53 * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
59 * block hash chain and also no longer being referenced, both e_used,
60 * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
64#include <linux/kernel.h>
65#include <linux/module.h>
66
67#include <linux/hash.h>
68#include <linux/fs.h>
69#include <linux/mm.h>
70#include <linux/slab.h> 2#include <linux/slab.h>
71#include <linux/sched.h> 3#include <linux/list.h>
72#include <linux/list_bl.h> 4#include <linux/list_bl.h>
5#include <linux/module.h>
6#include <linux/sched.h>
7#include <linux/workqueue.h>
73#include <linux/mbcache.h> 8#include <linux/mbcache.h>
74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
76#include <linux/log2.h>
77
78#ifdef MB_CACHE_DEBUG
79# define mb_debug(f...) do { \
80 printk(KERN_DEBUG f); \
81 printk("\n"); \
82 } while (0)
83#define mb_assert(c) do { if (!(c)) \
84 printk(KERN_ERR "assertion " #c " failed\n"); \
85 } while(0)
86#else
87# define mb_debug(f...) do { } while(0)
88# define mb_assert(c) do { } while(0)
89#endif
90#define mb_error(f...) do { \
91 printk(KERN_ERR f); \
92 printk("\n"); \
93 } while(0)
94
95#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
96
97#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
98#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
99 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
100
101static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
102static struct blockgroup_lock *mb_cache_bg_lock;
103static struct kmem_cache *mb_cache_kmem_cache;
104
105MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
106MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
107MODULE_LICENSE("GPL");
108
109EXPORT_SYMBOL(mb_cache_create);
110EXPORT_SYMBOL(mb_cache_shrink);
111EXPORT_SYMBOL(mb_cache_destroy);
112EXPORT_SYMBOL(mb_cache_entry_alloc);
113EXPORT_SYMBOL(mb_cache_entry_insert);
114EXPORT_SYMBOL(mb_cache_entry_release);
115EXPORT_SYMBOL(mb_cache_entry_free);
116EXPORT_SYMBOL(mb_cache_entry_get);
117#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
118EXPORT_SYMBOL(mb_cache_entry_find_first);
119EXPORT_SYMBOL(mb_cache_entry_find_next);
120#endif
121 9
122/* 10/*
123 * Global data: list of all mbcache's, lru list, and a spinlock for 11 * Mbcache is a simple key-value store. Keys need not be unique, however
124 * accessing cache data structures on SMP machines. The lru list is 12 * key-value pairs are expected to be unique (we use this fact in
125 * global across all mbcaches. 13 * mb_cache_entry_delete_block()).
14 *
15 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
16 * They use hash of a block contents as a key and block number as a value.
17 * That's why keys need not be unique (different xattr blocks may end up having
18 * the same hash). However block number always uniquely identifies a cache
19 * entry.
20 *
21 * We provide functions for creation and removal of entries, search by key,
22 * and a special "delete entry with given key-value pair" operation. Fixed
23 * size hash table is used for fast key lookups.
126 */ 24 */
127 25
128static LIST_HEAD(mb_cache_list); 26struct mb_cache {
129static LIST_HEAD(mb_cache_lru_list); 27 /* Hash table of entries */
130static DEFINE_SPINLOCK(mb_cache_spinlock); 28 struct hlist_bl_head *c_hash;
131 29 /* log2 of hash table size */
132static inline void 30 int c_bucket_bits;
133__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) 31 /* Maximum entries in cache to avoid degrading hash too much */
134{ 32 int c_max_entries;
135 spin_lock(bgl_lock_ptr(mb_cache_bg_lock, 33 /* Protects c_list, c_entry_count */
136 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 34 spinlock_t c_list_lock;
137} 35 struct list_head c_list;
138 36 /* Number of entries in cache */
139static inline void 37 unsigned long c_entry_count;
140__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) 38 struct shrinker c_shrink;
141{ 39 /* Work for shrinking when the cache has too many entries */
142 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, 40 struct work_struct c_shrink_work;
143 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 41};
144}
145
146static inline int
147__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
148{
149 return !hlist_bl_unhashed(&ce->e_block_list);
150}
151 42
43static struct kmem_cache *mb_entry_cache;
152 44
153static inline void 45static unsigned long mb_cache_shrink(struct mb_cache *cache,
154__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) 46 unsigned int nr_to_scan);
155{
156 if (__mb_cache_entry_is_block_hashed(ce))
157 hlist_bl_del_init(&ce->e_block_list);
158}
159 47
160static inline int 48static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
161__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) 49 u32 key)
162{ 50{
163 return !hlist_bl_unhashed(&ce->e_index.o_list); 51 return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
164} 52}
165 53
166static inline void 54/*
167__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) 55 * Number of entries to reclaim synchronously when there are too many entries
168{ 56 * in cache
169 if (__mb_cache_entry_is_index_hashed(ce)) 57 */
170 hlist_bl_del_init(&ce->e_index.o_list); 58#define SYNC_SHRINK_BATCH 64
171}
172 59
173/* 60/*
174 * __mb_cache_entry_unhash_unlock() 61 * mb_cache_entry_create - create entry in cache
175 * 62 * @cache - cache where the entry should be created
176 * This function is called to unhash both the block and index hash 63 * @mask - gfp mask with which the entry should be allocated
177 * chain. 64 * @key - key of the entry
178 * It assumes both the block and index hash chain is locked upon entry. 65 * @block - block that contains data
179 * It also unlock both hash chains both exit 66 * @reusable - is the block reusable by other inodes?
67 *
68 * Creates entry in @cache with key @key and records that data is stored in
69 * block @block. The function returns -EBUSY if entry with the same key
70 * and for the same block already exists in cache. Otherwise 0 is returned.
180 */ 71 */
181static inline void 72int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
182__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) 73 sector_t block, bool reusable)
183{ 74{
184 __mb_cache_entry_unhash_index(ce); 75 struct mb_cache_entry *entry, *dup;
185 hlist_bl_unlock(ce->e_index_hash_p); 76 struct hlist_bl_node *dup_node;
186 __mb_cache_entry_unhash_block(ce); 77 struct hlist_bl_head *head;
187 hlist_bl_unlock(ce->e_block_hash_p); 78
79 /* Schedule background reclaim if there are too many entries */
80 if (cache->c_entry_count >= cache->c_max_entries)
81 schedule_work(&cache->c_shrink_work);
82 /* Do some sync reclaim if background reclaim cannot keep up */
83 if (cache->c_entry_count >= 2*cache->c_max_entries)
84 mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
85
86 entry = kmem_cache_alloc(mb_entry_cache, mask);
87 if (!entry)
88 return -ENOMEM;
89
90 INIT_LIST_HEAD(&entry->e_list);
91 /* One ref for hash, one ref returned */
92 atomic_set(&entry->e_refcnt, 1);
93 entry->e_key = key;
94 entry->e_block = block;
95 entry->e_reusable = reusable;
96 head = mb_cache_entry_head(cache, key);
97 hlist_bl_lock(head);
98 hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
99 if (dup->e_key == key && dup->e_block == block) {
100 hlist_bl_unlock(head);
101 kmem_cache_free(mb_entry_cache, entry);
102 return -EBUSY;
103 }
104 }
105 hlist_bl_add_head(&entry->e_hash_list, head);
106 hlist_bl_unlock(head);
107
108 spin_lock(&cache->c_list_lock);
109 list_add_tail(&entry->e_list, &cache->c_list);
110 /* Grab ref for LRU list */
111 atomic_inc(&entry->e_refcnt);
112 cache->c_entry_count++;
113 spin_unlock(&cache->c_list_lock);
114
115 return 0;
188} 116}
117EXPORT_SYMBOL(mb_cache_entry_create);
189 118
190static void 119void __mb_cache_entry_free(struct mb_cache_entry *entry)
191__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
192{ 120{
193 struct mb_cache *cache = ce->e_cache; 121 kmem_cache_free(mb_entry_cache, entry);
194
195 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
196 kmem_cache_free(cache->c_entry_cache, ce);
197 atomic_dec(&cache->c_entry_count);
198} 122}
123EXPORT_SYMBOL(__mb_cache_entry_free);
199 124
200static void 125static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
201__mb_cache_entry_release(struct mb_cache_entry *ce) 126 struct mb_cache_entry *entry,
127 u32 key)
202{ 128{
203 /* First lock the entry to serialize access to its local data. */ 129 struct mb_cache_entry *old_entry = entry;
204 __spin_lock_mb_cache_entry(ce); 130 struct hlist_bl_node *node;
205 /* Wake up all processes queuing for this cache entry. */ 131 struct hlist_bl_head *head;
206 if (ce->e_queued) 132
207 wake_up_all(&mb_cache_queue); 133 head = mb_cache_entry_head(cache, key);
208 if (ce->e_used >= MB_CACHE_WRITER) 134 hlist_bl_lock(head);
209 ce->e_used -= MB_CACHE_WRITER; 135 if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
210 /* 136 node = entry->e_hash_list.next;
211 * Make sure that all cache entries on lru_list have 137 else
212 * both e_used and e_qued of 0s. 138 node = hlist_bl_first(head);
213 */ 139 while (node) {
214 ce->e_used--; 140 entry = hlist_bl_entry(node, struct mb_cache_entry,
215 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { 141 e_hash_list);
216 if (!__mb_cache_entry_is_block_hashed(ce)) { 142 if (entry->e_key == key && entry->e_reusable) {
217 __spin_unlock_mb_cache_entry(ce); 143 atomic_inc(&entry->e_refcnt);
218 goto forget; 144 goto out;
219 } 145 }
220 /* 146 node = node->next;
221 * Need access to lru list, first drop entry lock,
222 * then reacquire the lock in the proper order.
223 */
224 spin_lock(&mb_cache_spinlock);
225 if (list_empty(&ce->e_lru_list))
226 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
227 spin_unlock(&mb_cache_spinlock);
228 } 147 }
229 __spin_unlock_mb_cache_entry(ce); 148 entry = NULL;
230 return; 149out:
231forget: 150 hlist_bl_unlock(head);
232 mb_assert(list_empty(&ce->e_lru_list)); 151 if (old_entry)
233 __mb_cache_entry_forget(ce, GFP_KERNEL); 152 mb_cache_entry_put(cache, old_entry);
153
154 return entry;
234} 155}
235 156
236/* 157/*
237 * mb_cache_shrink_scan() memory pressure callback 158 * mb_cache_entry_find_first - find the first entry in cache with given key
238 * 159 * @cache: cache where we should search
239 * This function is called by the kernel memory management when memory 160 * @key: key to look for
240 * gets low.
241 * 161 *
242 * @shrink: (ignored) 162 * Search in @cache for entry with key @key. Grabs reference to the first
243 * @sc: shrink_control passed from reclaim 163 * entry found and returns the entry.
244 *
245 * Returns the number of objects freed.
246 */ 164 */
247static unsigned long 165struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
248mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 166 u32 key)
249{ 167{
250 LIST_HEAD(free_list); 168 return __entry_find(cache, NULL, key);
251 struct mb_cache_entry *entry, *tmp;
252 int nr_to_scan = sc->nr_to_scan;
253 gfp_t gfp_mask = sc->gfp_mask;
254 unsigned long freed = 0;
255
256 mb_debug("trying to free %d entries", nr_to_scan);
257 spin_lock(&mb_cache_spinlock);
258 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
259 struct mb_cache_entry *ce =
260 list_entry(mb_cache_lru_list.next,
261 struct mb_cache_entry, e_lru_list);
262 list_del_init(&ce->e_lru_list);
263 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
264 continue;
265 spin_unlock(&mb_cache_spinlock);
266 /* Prevent any find or get operation on the entry */
267 hlist_bl_lock(ce->e_block_hash_p);
268 hlist_bl_lock(ce->e_index_hash_p);
269 /* Ignore if it is touched by a find/get */
270 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
271 !list_empty(&ce->e_lru_list)) {
272 hlist_bl_unlock(ce->e_index_hash_p);
273 hlist_bl_unlock(ce->e_block_hash_p);
274 spin_lock(&mb_cache_spinlock);
275 continue;
276 }
277 __mb_cache_entry_unhash_unlock(ce);
278 list_add_tail(&ce->e_lru_list, &free_list);
279 spin_lock(&mb_cache_spinlock);
280 }
281 spin_unlock(&mb_cache_spinlock);
282
283 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
284 __mb_cache_entry_forget(entry, gfp_mask);
285 freed++;
286 }
287 return freed;
288} 169}
170EXPORT_SYMBOL(mb_cache_entry_find_first);
289 171
290static unsigned long 172/*
291mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 173 * mb_cache_entry_find_next - find next entry in cache with the same
174 * @cache: cache where we should search
175 * @entry: entry to start search from
176 *
177 * Finds next entry in the hash chain which has the same key as @entry.
178 * If @entry is unhashed (which can happen when deletion of entry races
179 * with the search), finds the first entry in the hash chain. The function
180 * drops reference to @entry and returns with a reference to the found entry.
181 */
182struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
183 struct mb_cache_entry *entry)
292{ 184{
293 struct mb_cache *cache; 185 return __entry_find(cache, entry, entry->e_key);
294 unsigned long count = 0;
295
296 spin_lock(&mb_cache_spinlock);
297 list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
298 mb_debug("cache %s (%d)", cache->c_name,
299 atomic_read(&cache->c_entry_count));
300 count += atomic_read(&cache->c_entry_count);
301 }
302 spin_unlock(&mb_cache_spinlock);
303
304 return vfs_pressure_ratio(count);
305} 186}
306 187EXPORT_SYMBOL(mb_cache_entry_find_next);
307static struct shrinker mb_cache_shrinker = {
308 .count_objects = mb_cache_shrink_count,
309 .scan_objects = mb_cache_shrink_scan,
310 .seeks = DEFAULT_SEEKS,
311};
312 188
313/* 189/*
314 * mb_cache_create() create a new cache 190 * mb_cache_entry_get - get a cache entry by block number (and key)
315 * 191 * @cache - cache we work with
316 * All entries in one cache are equal size. Cache entries may be from 192 * @key - key of block number @block
317 * multiple devices. If this is the first mbcache created, registers 193 * @block - block number
318 * the cache with kernel memory management. Returns NULL if no more
319 * memory was available.
320 *
321 * @name: name of the cache (informal)
322 * @bucket_bits: log2(number of hash buckets)
323 */ 194 */
324struct mb_cache * 195struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
325mb_cache_create(const char *name, int bucket_bits) 196 sector_t block)
326{ 197{
327 int n, bucket_count = 1 << bucket_bits; 198 struct hlist_bl_node *node;
328 struct mb_cache *cache = NULL; 199 struct hlist_bl_head *head;
329 200 struct mb_cache_entry *entry;
330 if (!mb_cache_bg_lock) { 201
331 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), 202 head = mb_cache_entry_head(cache, key);
332 GFP_KERNEL); 203 hlist_bl_lock(head);
333 if (!mb_cache_bg_lock) 204 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
334 return NULL; 205 if (entry->e_key == key && entry->e_block == block) {
335 bgl_lock_init(mb_cache_bg_lock); 206 atomic_inc(&entry->e_refcnt);
336 } 207 goto out;
337 208 }
338 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
339 if (!cache)
340 return NULL;
341 cache->c_name = name;
342 atomic_set(&cache->c_entry_count, 0);
343 cache->c_bucket_bits = bucket_bits;
344 cache->c_block_hash = kmalloc(bucket_count *
345 sizeof(struct hlist_bl_head), GFP_KERNEL);
346 if (!cache->c_block_hash)
347 goto fail;
348 for (n=0; n<bucket_count; n++)
349 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
350 cache->c_index_hash = kmalloc(bucket_count *
351 sizeof(struct hlist_bl_head), GFP_KERNEL);
352 if (!cache->c_index_hash)
353 goto fail;
354 for (n=0; n<bucket_count; n++)
355 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
356 if (!mb_cache_kmem_cache) {
357 mb_cache_kmem_cache = kmem_cache_create(name,
358 sizeof(struct mb_cache_entry), 0,
359 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
360 if (!mb_cache_kmem_cache)
361 goto fail2;
362 } 209 }
363 cache->c_entry_cache = mb_cache_kmem_cache; 210 entry = NULL;
364 211out:
365 /* 212 hlist_bl_unlock(head);
366 * Set an upper limit on the number of cache entries so that the hash 213 return entry;
367 * chains won't grow too long.
368 */
369 cache->c_max_entries = bucket_count << 4;
370
371 spin_lock(&mb_cache_spinlock);
372 list_add(&cache->c_cache_list, &mb_cache_list);
373 spin_unlock(&mb_cache_spinlock);
374 return cache;
375
376fail2:
377 kfree(cache->c_index_hash);
378
379fail:
380 kfree(cache->c_block_hash);
381 kfree(cache);
382 return NULL;
383} 214}
215EXPORT_SYMBOL(mb_cache_entry_get);
384 216
385 217/* mb_cache_entry_delete_block - remove information about block from cache
386/* 218 * @cache - cache we work with
387 * mb_cache_shrink() 219 * @key - key of block @block
388 * 220 * @block - block number
389 * Removes all cache entries of a device from the cache. All cache entries
390 * currently in use cannot be freed, and thus remain in the cache. All others
391 * are freed.
392 * 221 *
393 * @bdev: which device's cache entries to shrink 222 * Remove entry from cache @cache with key @key with data stored in @block.
394 */ 223 */
395void 224void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
396mb_cache_shrink(struct block_device *bdev) 225 sector_t block)
397{ 226{
398 LIST_HEAD(free_list); 227 struct hlist_bl_node *node;
399 struct list_head *l; 228 struct hlist_bl_head *head;
400 struct mb_cache_entry *ce, *tmp; 229 struct mb_cache_entry *entry;
401 230
402 l = &mb_cache_lru_list; 231 head = mb_cache_entry_head(cache, key);
403 spin_lock(&mb_cache_spinlock); 232 hlist_bl_lock(head);
404 while (!list_is_last(l, &mb_cache_lru_list)) { 233 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
405 l = l->next; 234 if (entry->e_key == key && entry->e_block == block) {
406 ce = list_entry(l, struct mb_cache_entry, e_lru_list); 235 /* We keep hash list reference to keep entry alive */
407 if (ce->e_bdev == bdev) { 236 hlist_bl_del_init(&entry->e_hash_list);
408 list_del_init(&ce->e_lru_list); 237 hlist_bl_unlock(head);
409 if (ce->e_used || ce->e_queued || 238 spin_lock(&cache->c_list_lock);
410 atomic_read(&ce->e_refcnt)) 239 if (!list_empty(&entry->e_list)) {
411 continue; 240 list_del_init(&entry->e_list);
412 spin_unlock(&mb_cache_spinlock); 241 cache->c_entry_count--;
413 /* 242 atomic_dec(&entry->e_refcnt);
414 * Prevent any find or get operation on the entry.
415 */
416 hlist_bl_lock(ce->e_block_hash_p);
417 hlist_bl_lock(ce->e_index_hash_p);
418 /* Ignore if it is touched by a find/get */
419 if (ce->e_used || ce->e_queued ||
420 atomic_read(&ce->e_refcnt) ||
421 !list_empty(&ce->e_lru_list)) {
422 hlist_bl_unlock(ce->e_index_hash_p);
423 hlist_bl_unlock(ce->e_block_hash_p);
424 l = &mb_cache_lru_list;
425 spin_lock(&mb_cache_spinlock);
426 continue;
427 } 243 }
428 __mb_cache_entry_unhash_unlock(ce); 244 spin_unlock(&cache->c_list_lock);
429 mb_assert(!(ce->e_used || ce->e_queued || 245 mb_cache_entry_put(cache, entry);
430 atomic_read(&ce->e_refcnt))); 246 return;
431 list_add_tail(&ce->e_lru_list, &free_list);
432 l = &mb_cache_lru_list;
433 spin_lock(&mb_cache_spinlock);
434 } 247 }
435 } 248 }
436 spin_unlock(&mb_cache_spinlock); 249 hlist_bl_unlock(head);
437
438 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
439 __mb_cache_entry_forget(ce, GFP_KERNEL);
440 }
441} 250}
251EXPORT_SYMBOL(mb_cache_entry_delete_block);
442 252
443 253/* mb_cache_entry_touch - cache entry got used
444/* 254 * @cache - cache the entry belongs to
445 * mb_cache_destroy() 255 * @entry - entry that got used
446 * 256 *
447 * Shrinks the cache to its minimum possible size (hopefully 0 entries), 257 * Marks entry as used to give hit higher chances of surviving in cache.
448 * and then destroys it. If this was the last mbcache, un-registers the
449 * mbcache from kernel memory management.
450 */ 258 */
451void 259void mb_cache_entry_touch(struct mb_cache *cache,
452mb_cache_destroy(struct mb_cache *cache) 260 struct mb_cache_entry *entry)
453{ 261{
454 LIST_HEAD(free_list); 262 entry->e_referenced = 1;
455 struct mb_cache_entry *ce, *tmp;
456
457 spin_lock(&mb_cache_spinlock);
458 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
459 if (ce->e_cache == cache)
460 list_move_tail(&ce->e_lru_list, &free_list);
461 }
462 list_del(&cache->c_cache_list);
463 spin_unlock(&mb_cache_spinlock);
464
465 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
466 list_del_init(&ce->e_lru_list);
467 /*
468 * Prevent any find or get operation on the entry.
469 */
470 hlist_bl_lock(ce->e_block_hash_p);
471 hlist_bl_lock(ce->e_index_hash_p);
472 mb_assert(!(ce->e_used || ce->e_queued ||
473 atomic_read(&ce->e_refcnt)));
474 __mb_cache_entry_unhash_unlock(ce);
475 __mb_cache_entry_forget(ce, GFP_KERNEL);
476 }
477
478 if (atomic_read(&cache->c_entry_count) > 0) {
479 mb_error("cache %s: %d orphaned entries",
480 cache->c_name,
481 atomic_read(&cache->c_entry_count));
482 }
483
484 if (list_empty(&mb_cache_list)) {
485 kmem_cache_destroy(mb_cache_kmem_cache);
486 mb_cache_kmem_cache = NULL;
487 }
488 kfree(cache->c_index_hash);
489 kfree(cache->c_block_hash);
490 kfree(cache);
491} 263}
264EXPORT_SYMBOL(mb_cache_entry_touch);
492 265
493/* 266static unsigned long mb_cache_count(struct shrinker *shrink,
494 * mb_cache_entry_alloc() 267 struct shrink_control *sc)
495 *
496 * Allocates a new cache entry. The new entry will not be valid initially,
497 * and thus cannot be looked up yet. It should be filled with data, and
498 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
499 * if no more memory was available.
500 */
501struct mb_cache_entry *
502mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
503{ 268{
504 struct mb_cache_entry *ce; 269 struct mb_cache *cache = container_of(shrink, struct mb_cache,
505 270 c_shrink);
506 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
507 struct list_head *l;
508
509 l = &mb_cache_lru_list;
510 spin_lock(&mb_cache_spinlock);
511 while (!list_is_last(l, &mb_cache_lru_list)) {
512 l = l->next;
513 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
514 if (ce->e_cache == cache) {
515 list_del_init(&ce->e_lru_list);
516 if (ce->e_used || ce->e_queued ||
517 atomic_read(&ce->e_refcnt))
518 continue;
519 spin_unlock(&mb_cache_spinlock);
520 /*
521 * Prevent any find or get operation on the
522 * entry.
523 */
524 hlist_bl_lock(ce->e_block_hash_p);
525 hlist_bl_lock(ce->e_index_hash_p);
526 /* Ignore if it is touched by a find/get */
527 if (ce->e_used || ce->e_queued ||
528 atomic_read(&ce->e_refcnt) ||
529 !list_empty(&ce->e_lru_list)) {
530 hlist_bl_unlock(ce->e_index_hash_p);
531 hlist_bl_unlock(ce->e_block_hash_p);
532 l = &mb_cache_lru_list;
533 spin_lock(&mb_cache_spinlock);
534 continue;
535 }
536 mb_assert(list_empty(&ce->e_lru_list));
537 mb_assert(!(ce->e_used || ce->e_queued ||
538 atomic_read(&ce->e_refcnt)));
539 __mb_cache_entry_unhash_unlock(ce);
540 goto found;
541 }
542 }
543 spin_unlock(&mb_cache_spinlock);
544 }
545 271
546 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 272 return cache->c_entry_count;
547 if (!ce)
548 return NULL;
549 atomic_inc(&cache->c_entry_count);
550 INIT_LIST_HEAD(&ce->e_lru_list);
551 INIT_HLIST_BL_NODE(&ce->e_block_list);
552 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
553 ce->e_cache = cache;
554 ce->e_queued = 0;
555 atomic_set(&ce->e_refcnt, 0);
556found:
557 ce->e_block_hash_p = &cache->c_block_hash[0];
558 ce->e_index_hash_p = &cache->c_index_hash[0];
559 ce->e_used = 1 + MB_CACHE_WRITER;
560 return ce;
561} 273}
562 274
563 275/* Shrink number of entries in cache */
564/* 276static unsigned long mb_cache_shrink(struct mb_cache *cache,
565 * mb_cache_entry_insert() 277 unsigned int nr_to_scan)
566 *
567 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
568 * the cache. After this, the cache entry can be looked up, but is not yet
569 * in the lru list as the caller still holds a handle to it. Returns 0 on
570 * success, or -EBUSY if a cache entry for that device + inode exists
571 * already (this may happen after a failed lookup, but when another process
572 * has inserted the same cache entry in the meantime).
573 *
574 * @bdev: device the cache entry belongs to
575 * @block: block number
576 * @key: lookup key
577 */
578int
579mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
580 sector_t block, unsigned int key)
581{ 278{
582 struct mb_cache *cache = ce->e_cache; 279 struct mb_cache_entry *entry;
583 unsigned int bucket; 280 struct hlist_bl_head *head;
584 struct hlist_bl_node *l; 281 unsigned int shrunk = 0;
585 struct hlist_bl_head *block_hash_p; 282
586 struct hlist_bl_head *index_hash_p; 283 spin_lock(&cache->c_list_lock);
587 struct mb_cache_entry *lce; 284 while (nr_to_scan-- && !list_empty(&cache->c_list)) {
588 285 entry = list_first_entry(&cache->c_list,
589 mb_assert(ce); 286 struct mb_cache_entry, e_list);
590 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 287 if (entry->e_referenced) {
591 cache->c_bucket_bits); 288 entry->e_referenced = 0;
592 block_hash_p = &cache->c_block_hash[bucket]; 289 list_move_tail(&cache->c_list, &entry->e_list);
593 hlist_bl_lock(block_hash_p); 290 continue;
594 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
595 if (lce->e_bdev == bdev && lce->e_block == block) {
596 hlist_bl_unlock(block_hash_p);
597 return -EBUSY;
598 } 291 }
292 list_del_init(&entry->e_list);
293 cache->c_entry_count--;
294 /*
295 * We keep LRU list reference so that entry doesn't go away
296 * from under us.
297 */
298 spin_unlock(&cache->c_list_lock);
299 head = mb_cache_entry_head(cache, entry->e_key);
300 hlist_bl_lock(head);
301 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
302 hlist_bl_del_init(&entry->e_hash_list);
303 atomic_dec(&entry->e_refcnt);
304 }
305 hlist_bl_unlock(head);
306 if (mb_cache_entry_put(cache, entry))
307 shrunk++;
308 cond_resched();
309 spin_lock(&cache->c_list_lock);
599 } 310 }
600 mb_assert(!__mb_cache_entry_is_block_hashed(ce)); 311 spin_unlock(&cache->c_list_lock);
601 __mb_cache_entry_unhash_block(ce);
602 __mb_cache_entry_unhash_index(ce);
603 ce->e_bdev = bdev;
604 ce->e_block = block;
605 ce->e_block_hash_p = block_hash_p;
606 ce->e_index.o_key = key;
607 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
608 hlist_bl_unlock(block_hash_p);
609 bucket = hash_long(key, cache->c_bucket_bits);
610 index_hash_p = &cache->c_index_hash[bucket];
611 hlist_bl_lock(index_hash_p);
612 ce->e_index_hash_p = index_hash_p;
613 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
614 hlist_bl_unlock(index_hash_p);
615 return 0;
616}
617 312
313 return shrunk;
314}
618 315
619/* 316static unsigned long mb_cache_scan(struct shrinker *shrink,
620 * mb_cache_entry_release() 317 struct shrink_control *sc)
621 *
622 * Release a handle to a cache entry. When the last handle to a cache entry
623 * is released it is either freed (if it is invalid) or otherwise inserted
624 * in to the lru list.
625 */
626void
627mb_cache_entry_release(struct mb_cache_entry *ce)
628{ 318{
629 __mb_cache_entry_release(ce); 319 int nr_to_scan = sc->nr_to_scan;
320 struct mb_cache *cache = container_of(shrink, struct mb_cache,
321 c_shrink);
322 return mb_cache_shrink(cache, nr_to_scan);
630} 323}
631 324
325/* We shrink 1/X of the cache when we have too many entries in it */
326#define SHRINK_DIVISOR 16
632 327
633/* 328static void mb_cache_shrink_worker(struct work_struct *work)
634 * mb_cache_entry_free()
635 *
636 */
637void
638mb_cache_entry_free(struct mb_cache_entry *ce)
639{ 329{
640 mb_assert(ce); 330 struct mb_cache *cache = container_of(work, struct mb_cache,
641 mb_assert(list_empty(&ce->e_lru_list)); 331 c_shrink_work);
642 hlist_bl_lock(ce->e_index_hash_p); 332 mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
643 __mb_cache_entry_unhash_index(ce);
644 hlist_bl_unlock(ce->e_index_hash_p);
645 hlist_bl_lock(ce->e_block_hash_p);
646 __mb_cache_entry_unhash_block(ce);
647 hlist_bl_unlock(ce->e_block_hash_p);
648 __mb_cache_entry_release(ce);
649} 333}
650 334
651
652/* 335/*
653 * mb_cache_entry_get() 336 * mb_cache_create - create cache
337 * @bucket_bits: log2 of the hash table size
654 * 338 *
655 * Get a cache entry by device / block number. (There can only be one entry 339 * Create cache for keys with 2^bucket_bits hash entries.
656 * in the cache per device and block.) Returns NULL if no such cache entry
657 * exists. The returned cache entry is locked for exclusive access ("single
658 * writer").
659 */ 340 */
660struct mb_cache_entry * 341struct mb_cache *mb_cache_create(int bucket_bits)
661mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
662 sector_t block)
663{ 342{
664 unsigned int bucket; 343 struct mb_cache *cache;
665 struct hlist_bl_node *l; 344 int bucket_count = 1 << bucket_bits;
666 struct mb_cache_entry *ce; 345 int i;
667 struct hlist_bl_head *block_hash_p;
668
669 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
670 cache->c_bucket_bits);
671 block_hash_p = &cache->c_block_hash[bucket];
672 /* First serialize access to the block corresponding hash chain. */
673 hlist_bl_lock(block_hash_p);
674 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
675 mb_assert(ce->e_block_hash_p == block_hash_p);
676 if (ce->e_bdev == bdev && ce->e_block == block) {
677 /*
678 * Prevent a free from removing the entry.
679 */
680 atomic_inc(&ce->e_refcnt);
681 hlist_bl_unlock(block_hash_p);
682 __spin_lock_mb_cache_entry(ce);
683 atomic_dec(&ce->e_refcnt);
684 if (ce->e_used > 0) {
685 DEFINE_WAIT(wait);
686 while (ce->e_used > 0) {
687 ce->e_queued++;
688 prepare_to_wait(&mb_cache_queue, &wait,
689 TASK_UNINTERRUPTIBLE);
690 __spin_unlock_mb_cache_entry(ce);
691 schedule();
692 __spin_lock_mb_cache_entry(ce);
693 ce->e_queued--;
694 }
695 finish_wait(&mb_cache_queue, &wait);
696 }
697 ce->e_used += 1 + MB_CACHE_WRITER;
698 __spin_unlock_mb_cache_entry(ce);
699 346
700 if (!list_empty(&ce->e_lru_list)) { 347 if (!try_module_get(THIS_MODULE))
701 spin_lock(&mb_cache_spinlock); 348 return NULL;
702 list_del_init(&ce->e_lru_list); 349
703 spin_unlock(&mb_cache_spinlock); 350 cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
704 } 351 if (!cache)
705 if (!__mb_cache_entry_is_block_hashed(ce)) { 352 goto err_out;
706 __mb_cache_entry_release(ce); 353 cache->c_bucket_bits = bucket_bits;
707 return NULL; 354 cache->c_max_entries = bucket_count << 4;
708 } 355 INIT_LIST_HEAD(&cache->c_list);
709 return ce; 356 spin_lock_init(&cache->c_list_lock);
710 } 357 cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
358 GFP_KERNEL);
359 if (!cache->c_hash) {
360 kfree(cache);
361 goto err_out;
711 } 362 }
712 hlist_bl_unlock(block_hash_p); 363 for (i = 0; i < bucket_count; i++)
713 return NULL; 364 INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
714}
715 365
716#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 366 cache->c_shrink.count_objects = mb_cache_count;
367 cache->c_shrink.scan_objects = mb_cache_scan;
368 cache->c_shrink.seeks = DEFAULT_SEEKS;
369 register_shrinker(&cache->c_shrink);
717 370
718static struct mb_cache_entry * 371 INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
719__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
720 struct block_device *bdev, unsigned int key)
721{
722 372
723 /* The index hash chain is alredy acquire by caller. */ 373 return cache;
724 while (l != NULL) { 374
725 struct mb_cache_entry *ce = 375err_out:
726 hlist_bl_entry(l, struct mb_cache_entry, 376 module_put(THIS_MODULE);
727 e_index.o_list);
728 mb_assert(ce->e_index_hash_p == head);
729 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
730 /*
731 * Prevent a free from removing the entry.
732 */
733 atomic_inc(&ce->e_refcnt);
734 hlist_bl_unlock(head);
735 __spin_lock_mb_cache_entry(ce);
736 atomic_dec(&ce->e_refcnt);
737 ce->e_used++;
738 /* Incrementing before holding the lock gives readers
739 priority over writers. */
740 if (ce->e_used >= MB_CACHE_WRITER) {
741 DEFINE_WAIT(wait);
742
743 while (ce->e_used >= MB_CACHE_WRITER) {
744 ce->e_queued++;
745 prepare_to_wait(&mb_cache_queue, &wait,
746 TASK_UNINTERRUPTIBLE);
747 __spin_unlock_mb_cache_entry(ce);
748 schedule();
749 __spin_lock_mb_cache_entry(ce);
750 ce->e_queued--;
751 }
752 finish_wait(&mb_cache_queue, &wait);
753 }
754 __spin_unlock_mb_cache_entry(ce);
755 if (!list_empty(&ce->e_lru_list)) {
756 spin_lock(&mb_cache_spinlock);
757 list_del_init(&ce->e_lru_list);
758 spin_unlock(&mb_cache_spinlock);
759 }
760 if (!__mb_cache_entry_is_block_hashed(ce)) {
761 __mb_cache_entry_release(ce);
762 return ERR_PTR(-EAGAIN);
763 }
764 return ce;
765 }
766 l = l->next;
767 }
768 hlist_bl_unlock(head);
769 return NULL; 377 return NULL;
770} 378}
771 379EXPORT_SYMBOL(mb_cache_create);
772 380
773/* 381/*
774 * mb_cache_entry_find_first() 382 * mb_cache_destroy - destroy cache
775 * 383 * @cache: the cache to destroy
776 * Find the first cache entry on a given device with a certain key in
777 * an additional index. Additional matches can be found with
778 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
779 * returned cache entry is locked for shared access ("multiple readers").
780 * 384 *
781 * @cache: the cache to search 385 * Free all entries in cache and cache itself. Caller must make sure nobody
782 * @bdev: the device the cache entry should belong to 386 * (except shrinker) can reach @cache when calling this.
783 * @key: the key in the index
784 */ 387 */
785struct mb_cache_entry * 388void mb_cache_destroy(struct mb_cache *cache)
786mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
787 unsigned int key)
788{ 389{
789 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 390 struct mb_cache_entry *entry, *next;
790 struct hlist_bl_node *l;
791 struct mb_cache_entry *ce = NULL;
792 struct hlist_bl_head *index_hash_p;
793
794 index_hash_p = &cache->c_index_hash[bucket];
795 hlist_bl_lock(index_hash_p);
796 if (!hlist_bl_empty(index_hash_p)) {
797 l = hlist_bl_first(index_hash_p);
798 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
799 } else
800 hlist_bl_unlock(index_hash_p);
801 return ce;
802}
803 391
392 unregister_shrinker(&cache->c_shrink);
804 393
805/* 394 /*
806 * mb_cache_entry_find_next() 395 * We don't bother with any locking. Cache must not be used at this
807 * 396 * point.
808 * Find the next cache entry on a given device with a certain key in an 397 */
809 * additional index. Returns NULL if no match could be found. The previous 398 list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
810 * entry is atomatically released, so that mb_cache_entry_find_next() can 399 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
811 * be called like this: 400 hlist_bl_del_init(&entry->e_hash_list);
812 * 401 atomic_dec(&entry->e_refcnt);
813 * entry = mb_cache_entry_find_first(); 402 } else
814 * while (entry) { 403 WARN_ON(1);
815 * ... 404 list_del(&entry->e_list);
816 * entry = mb_cache_entry_find_next(entry, ...); 405 WARN_ON(atomic_read(&entry->e_refcnt) != 1);
817 * } 406 mb_cache_entry_put(cache, entry);
818 * 407 }
819 * @prev: The previous match 408 kfree(cache->c_hash);
820 * @bdev: the device the cache entry should belong to 409 kfree(cache);
821 * @key: the key in the index 410 module_put(THIS_MODULE);
822 */
823struct mb_cache_entry *
824mb_cache_entry_find_next(struct mb_cache_entry *prev,
825 struct block_device *bdev, unsigned int key)
826{
827 struct mb_cache *cache = prev->e_cache;
828 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
829 struct hlist_bl_node *l;
830 struct mb_cache_entry *ce;
831 struct hlist_bl_head *index_hash_p;
832
833 index_hash_p = &cache->c_index_hash[bucket];
834 mb_assert(prev->e_index_hash_p == index_hash_p);
835 hlist_bl_lock(index_hash_p);
836 mb_assert(!hlist_bl_empty(index_hash_p));
837 l = prev->e_index.o_list.next;
838 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
839 __mb_cache_entry_release(prev);
840 return ce;
841} 411}
412EXPORT_SYMBOL(mb_cache_destroy);
842 413
843#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ 414static int __init mbcache_init(void)
844
845static int __init init_mbcache(void)
846{ 415{
847 register_shrinker(&mb_cache_shrinker); 416 mb_entry_cache = kmem_cache_create("mbcache",
417 sizeof(struct mb_cache_entry), 0,
418 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
419 BUG_ON(!mb_entry_cache);
848 return 0; 420 return 0;
849} 421}
850 422
851static void __exit exit_mbcache(void) 423static void __exit mbcache_exit(void)
852{ 424{
853 unregister_shrinker(&mb_cache_shrinker); 425 kmem_cache_destroy(mb_entry_cache);
854} 426}
855 427
856module_init(init_mbcache) 428module_init(mbcache_init)
857module_exit(exit_mbcache) 429module_exit(mbcache_exit)
858 430
431MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
432MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
433MODULE_LICENSE("GPL");
diff --git a/fs/mpage.c b/fs/mpage.c
index 1480d3a18037..6bd9fd90964e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/prefetch.h> 25#include <linux/prefetch.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/mm_inline.h>
27#include <linux/writeback.h> 28#include <linux/writeback.h>
28#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
@@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
366 map_bh.b_state = 0; 367 map_bh.b_state = 0;
367 map_bh.b_size = 0; 368 map_bh.b_size = 0;
368 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 369 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
369 struct page *page = list_entry(pages->prev, struct page, lru); 370 struct page *page = lru_to_page(pages);
370 371
371 prefetchw(&page->flags); 372 prefetchw(&page->flags);
372 list_del(&page->lru); 373 list_del(&page->lru);
diff --git a/fs/namei.c b/fs/namei.c
index 9c590e0f66e9..1d9ca2d5dff6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1220,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd)
1220 1220
1221 if (need_mntput && path->mnt == mnt) 1221 if (need_mntput && path->mnt == mnt)
1222 mntput(path->mnt); 1222 mntput(path->mnt);
1223 if (ret == -EISDIR) 1223 if (ret == -EISDIR || !ret)
1224 ret = 0; 1224 ret = 1;
1225 if (need_mntput) 1225 if (need_mntput)
1226 nd->flags |= LOOKUP_JUMPED; 1226 nd->flags |= LOOKUP_JUMPED;
1227 if (unlikely(ret < 0)) 1227 if (unlikely(ret < 0))
@@ -1444,40 +1444,26 @@ static int follow_dotdot(struct nameidata *nd)
1444 * This looks up the name in dcache, possibly revalidates the old dentry and 1444 * This looks up the name in dcache, possibly revalidates the old dentry and
1445 * allocates a new one if not found or not valid. In the need_lookup argument 1445 * allocates a new one if not found or not valid. In the need_lookup argument
1446 * returns whether i_op->lookup is necessary. 1446 * returns whether i_op->lookup is necessary.
1447 *
1448 * dir->d_inode->i_mutex must be held
1449 */ 1447 */
1450static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, 1448static struct dentry *lookup_dcache(const struct qstr *name,
1451 unsigned int flags, bool *need_lookup) 1449 struct dentry *dir,
1450 unsigned int flags)
1452{ 1451{
1453 struct dentry *dentry; 1452 struct dentry *dentry;
1454 int error; 1453 int error;
1455 1454
1456 *need_lookup = false;
1457 dentry = d_lookup(dir, name); 1455 dentry = d_lookup(dir, name);
1458 if (dentry) { 1456 if (dentry) {
1459 if (dentry->d_flags & DCACHE_OP_REVALIDATE) { 1457 if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1460 error = d_revalidate(dentry, flags); 1458 error = d_revalidate(dentry, flags);
1461 if (unlikely(error <= 0)) { 1459 if (unlikely(error <= 0)) {
1462 if (error < 0) { 1460 if (!error)
1463 dput(dentry);
1464 return ERR_PTR(error);
1465 } else {
1466 d_invalidate(dentry); 1461 d_invalidate(dentry);
1467 dput(dentry); 1462 dput(dentry);
1468 dentry = NULL; 1463 return ERR_PTR(error);
1469 }
1470 } 1464 }
1471 } 1465 }
1472 } 1466 }
1473
1474 if (!dentry) {
1475 dentry = d_alloc(dir, name);
1476 if (unlikely(!dentry))
1477 return ERR_PTR(-ENOMEM);
1478
1479 *need_lookup = true;
1480 }
1481 return dentry; 1467 return dentry;
1482} 1468}
1483 1469
@@ -1506,45 +1492,44 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1506 return dentry; 1492 return dentry;
1507} 1493}
1508 1494
1509static struct dentry *__lookup_hash(struct qstr *name, 1495static struct dentry *__lookup_hash(const struct qstr *name,
1510 struct dentry *base, unsigned int flags) 1496 struct dentry *base, unsigned int flags)
1511{ 1497{
1512 bool need_lookup; 1498 struct dentry *dentry = lookup_dcache(name, base, flags);
1513 struct dentry *dentry;
1514 1499
1515 dentry = lookup_dcache(name, base, flags, &need_lookup); 1500 if (dentry)
1516 if (!need_lookup)
1517 return dentry; 1501 return dentry;
1518 1502
1503 dentry = d_alloc(base, name);
1504 if (unlikely(!dentry))
1505 return ERR_PTR(-ENOMEM);
1506
1519 return lookup_real(base->d_inode, dentry, flags); 1507 return lookup_real(base->d_inode, dentry, flags);
1520} 1508}
1521 1509
1522/*
1523 * It's more convoluted than I'd like it to be, but... it's still fairly
1524 * small and for now I'd prefer to have fast path as straight as possible.
1525 * It _is_ time-critical.
1526 */
1527static int lookup_fast(struct nameidata *nd, 1510static int lookup_fast(struct nameidata *nd,
1528 struct path *path, struct inode **inode, 1511 struct path *path, struct inode **inode,
1529 unsigned *seqp) 1512 unsigned *seqp)
1530{ 1513{
1531 struct vfsmount *mnt = nd->path.mnt; 1514 struct vfsmount *mnt = nd->path.mnt;
1532 struct dentry *dentry, *parent = nd->path.dentry; 1515 struct dentry *dentry, *parent = nd->path.dentry;
1533 int need_reval = 1;
1534 int status = 1; 1516 int status = 1;
1535 int err; 1517 int err;
1536 1518
1537 /* 1519 /*
1538 * Rename seqlock is not required here because in the off chance 1520 * Rename seqlock is not required here because in the off chance
1539 * of a false negative due to a concurrent rename, we're going to 1521 * of a false negative due to a concurrent rename, the caller is
1540 * do the non-racy lookup, below. 1522 * going to fall back to non-racy lookup.
1541 */ 1523 */
1542 if (nd->flags & LOOKUP_RCU) { 1524 if (nd->flags & LOOKUP_RCU) {
1543 unsigned seq; 1525 unsigned seq;
1544 bool negative; 1526 bool negative;
1545 dentry = __d_lookup_rcu(parent, &nd->last, &seq); 1527 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1546 if (!dentry) 1528 if (unlikely(!dentry)) {
1547 goto unlazy; 1529 if (unlazy_walk(nd, NULL, 0))
1530 return -ECHILD;
1531 return 0;
1532 }
1548 1533
1549 /* 1534 /*
1550 * This sequence count validates that the inode matches 1535 * This sequence count validates that the inode matches
@@ -1552,7 +1537,7 @@ static int lookup_fast(struct nameidata *nd,
1552 */ 1537 */
1553 *inode = d_backing_inode(dentry); 1538 *inode = d_backing_inode(dentry);
1554 negative = d_is_negative(dentry); 1539 negative = d_is_negative(dentry);
1555 if (read_seqcount_retry(&dentry->d_seq, seq)) 1540 if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
1556 return -ECHILD; 1541 return -ECHILD;
1557 1542
1558 /* 1543 /*
@@ -1562,81 +1547,89 @@ static int lookup_fast(struct nameidata *nd,
1562 * The memory barrier in read_seqcount_begin of child is 1547 * The memory barrier in read_seqcount_begin of child is
1563 * enough, we can use __read_seqcount_retry here. 1548 * enough, we can use __read_seqcount_retry here.
1564 */ 1549 */
1565 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1550 if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
1566 return -ECHILD; 1551 return -ECHILD;
1567 1552
1568 *seqp = seq; 1553 *seqp = seq;
1569 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1554 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1570 status = d_revalidate(dentry, nd->flags); 1555 status = d_revalidate(dentry, nd->flags);
1571 if (unlikely(status <= 0)) { 1556 if (unlikely(status <= 0)) {
1572 if (status != -ECHILD) 1557 if (unlazy_walk(nd, dentry, seq))
1573 need_reval = 0; 1558 return -ECHILD;
1574 goto unlazy; 1559 if (status == -ECHILD)
1575 } 1560 status = d_revalidate(dentry, nd->flags);
1561 } else {
1562 /*
1563 * Note: do negative dentry check after revalidation in
1564 * case that drops it.
1565 */
1566 if (unlikely(negative))
1567 return -ENOENT;
1568 path->mnt = mnt;
1569 path->dentry = dentry;
1570 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1571 return 1;
1572 if (unlazy_walk(nd, dentry, seq))
1573 return -ECHILD;
1576 } 1574 }
1577 /*
1578 * Note: do negative dentry check after revalidation in
1579 * case that drops it.
1580 */
1581 if (negative)
1582 return -ENOENT;
1583 path->mnt = mnt;
1584 path->dentry = dentry;
1585 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
1586 return 0;
1587unlazy:
1588 if (unlazy_walk(nd, dentry, seq))
1589 return -ECHILD;
1590 } else { 1575 } else {
1591 dentry = __d_lookup(parent, &nd->last); 1576 dentry = __d_lookup(parent, &nd->last);
1577 if (unlikely(!dentry))
1578 return 0;
1579 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
1580 status = d_revalidate(dentry, nd->flags);
1592 } 1581 }
1593
1594 if (unlikely(!dentry))
1595 goto need_lookup;
1596
1597 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1598 status = d_revalidate(dentry, nd->flags);
1599 if (unlikely(status <= 0)) { 1582 if (unlikely(status <= 0)) {
1600 if (status < 0) { 1583 if (!status)
1601 dput(dentry); 1584 d_invalidate(dentry);
1602 return status;
1603 }
1604 d_invalidate(dentry);
1605 dput(dentry); 1585 dput(dentry);
1606 goto need_lookup; 1586 return status;
1607 } 1587 }
1608
1609 if (unlikely(d_is_negative(dentry))) { 1588 if (unlikely(d_is_negative(dentry))) {
1610 dput(dentry); 1589 dput(dentry);
1611 return -ENOENT; 1590 return -ENOENT;
1612 } 1591 }
1592
1613 path->mnt = mnt; 1593 path->mnt = mnt;
1614 path->dentry = dentry; 1594 path->dentry = dentry;
1615 err = follow_managed(path, nd); 1595 err = follow_managed(path, nd);
1616 if (likely(!err)) 1596 if (likely(err > 0))
1617 *inode = d_backing_inode(path->dentry); 1597 *inode = d_backing_inode(path->dentry);
1618 return err; 1598 return err;
1619
1620need_lookup:
1621 return 1;
1622} 1599}
1623 1600
1624/* Fast lookup failed, do it the slow way */ 1601/* Fast lookup failed, do it the slow way */
1625static int lookup_slow(struct nameidata *nd, struct path *path) 1602static struct dentry *lookup_slow(const struct qstr *name,
1603 struct dentry *dir,
1604 unsigned int flags)
1626{ 1605{
1627 struct dentry *dentry, *parent; 1606 struct dentry *dentry;
1628 1607 inode_lock(dir->d_inode);
1629 parent = nd->path.dentry; 1608 dentry = d_lookup(dir, name);
1630 BUG_ON(nd->inode != parent->d_inode); 1609 if (unlikely(dentry)) {
1631 1610 if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
1632 inode_lock(parent->d_inode); 1611 !(flags & LOOKUP_NO_REVAL)) {
1633 dentry = __lookup_hash(&nd->last, parent, nd->flags); 1612 int error = d_revalidate(dentry, flags);
1634 inode_unlock(parent->d_inode); 1613 if (unlikely(error <= 0)) {
1635 if (IS_ERR(dentry)) 1614 if (!error)
1636 return PTR_ERR(dentry); 1615 d_invalidate(dentry);
1637 path->mnt = nd->path.mnt; 1616 dput(dentry);
1638 path->dentry = dentry; 1617 dentry = ERR_PTR(error);
1639 return follow_managed(path, nd); 1618 }
1619 }
1620 if (dentry) {
1621 inode_unlock(dir->d_inode);
1622 return dentry;
1623 }
1624 }
1625 dentry = d_alloc(dir, name);
1626 if (unlikely(!dentry)) {
1627 inode_unlock(dir->d_inode);
1628 return ERR_PTR(-ENOMEM);
1629 }
1630 dentry = lookup_real(dir->d_inode, dentry, flags);
1631 inode_unlock(dir->d_inode);
1632 return dentry;
1640} 1633}
1641 1634
1642static inline int may_lookup(struct nameidata *nd) 1635static inline int may_lookup(struct nameidata *nd)
@@ -1740,18 +1733,25 @@ static int walk_component(struct nameidata *nd, int flags)
1740 return err; 1733 return err;
1741 } 1734 }
1742 err = lookup_fast(nd, &path, &inode, &seq); 1735 err = lookup_fast(nd, &path, &inode, &seq);
1743 if (unlikely(err)) { 1736 if (unlikely(err <= 0)) {
1744 if (err < 0) 1737 if (err < 0)
1745 return err; 1738 return err;
1746 1739 path.dentry = lookup_slow(&nd->last, nd->path.dentry,
1747 err = lookup_slow(nd, &path); 1740 nd->flags);
1748 if (err < 0) 1741 if (IS_ERR(path.dentry))
1742 return PTR_ERR(path.dentry);
1743
1744 path.mnt = nd->path.mnt;
1745 err = follow_managed(&path, nd);
1746 if (unlikely(err < 0))
1749 return err; 1747 return err;
1750 1748
1749 if (unlikely(d_is_negative(path.dentry))) {
1750 path_to_nameidata(&path, nd);
1751 return -ENOENT;
1752 }
1753
1751 seq = 0; /* we are already out of RCU mode */ 1754 seq = 0; /* we are already out of RCU mode */
1752 err = -ENOENT;
1753 if (d_is_negative(path.dentry))
1754 goto out_path_put;
1755 inode = d_backing_inode(path.dentry); 1755 inode = d_backing_inode(path.dentry);
1756 } 1756 }
1757 1757
@@ -1764,10 +1764,6 @@ static int walk_component(struct nameidata *nd, int flags)
1764 nd->inode = inode; 1764 nd->inode = inode;
1765 nd->seq = seq; 1765 nd->seq = seq;
1766 return 0; 1766 return 0;
1767
1768out_path_put:
1769 path_to_nameidata(&path, nd);
1770 return err;
1771} 1767}
1772 1768
1773/* 1769/*
@@ -2373,21 +2369,9 @@ struct dentry *lookup_one_len_unlocked(const char *name,
2373 if (err) 2369 if (err)
2374 return ERR_PTR(err); 2370 return ERR_PTR(err);
2375 2371
2376 /* 2372 ret = lookup_dcache(&this, base, 0);
2377 * __d_lookup() is used to try to get a quick answer and avoid the 2373 if (!ret)
2378 * mutex. A false-negative does no harm. 2374 ret = lookup_slow(&this, base, 0);
2379 */
2380 ret = __d_lookup(base, &this);
2381 if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
2382 dput(ret);
2383 ret = NULL;
2384 }
2385 if (ret)
2386 return ret;
2387
2388 inode_lock(base->d_inode);
2389 ret = __lookup_hash(&this, base, 0);
2390 inode_unlock(base->d_inode);
2391 return ret; 2375 return ret;
2392} 2376}
2393EXPORT_SYMBOL(lookup_one_len_unlocked); 2377EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2465,31 +2449,21 @@ mountpoint_last(struct nameidata *nd, struct path *path)
2465 if (error) 2449 if (error)
2466 return error; 2450 return error;
2467 dentry = dget(nd->path.dentry); 2451 dentry = dget(nd->path.dentry);
2468 goto done; 2452 } else {
2469 } 2453 dentry = d_lookup(dir, &nd->last);
2470
2471 inode_lock(dir->d_inode);
2472 dentry = d_lookup(dir, &nd->last);
2473 if (!dentry) {
2474 /*
2475 * No cached dentry. Mounted dentries are pinned in the cache,
2476 * so that means that this dentry is probably a symlink or the
2477 * path doesn't actually point to a mounted dentry.
2478 */
2479 dentry = d_alloc(dir, &nd->last);
2480 if (!dentry) { 2454 if (!dentry) {
2481 inode_unlock(dir->d_inode); 2455 /*
2482 return -ENOMEM; 2456 * No cached dentry. Mounted dentries are pinned in the
2483 } 2457 * cache, so that means that this dentry is probably
2484 dentry = lookup_real(dir->d_inode, dentry, nd->flags); 2458 * a symlink or the path doesn't actually point
2485 if (IS_ERR(dentry)) { 2459 * to a mounted dentry.
2486 inode_unlock(dir->d_inode); 2460 */
2487 return PTR_ERR(dentry); 2461 dentry = lookup_slow(&nd->last, dir,
2462 nd->flags | LOOKUP_NO_REVAL);
2463 if (IS_ERR(dentry))
2464 return PTR_ERR(dentry);
2488 } 2465 }
2489 } 2466 }
2490 inode_unlock(dir->d_inode);
2491
2492done:
2493 if (d_is_negative(dentry)) { 2467 if (d_is_negative(dentry)) {
2494 dput(dentry); 2468 dput(dentry);
2495 return -ENOENT; 2469 return -ENOENT;
@@ -3018,16 +2992,22 @@ static int lookup_open(struct nameidata *nd, struct path *path,
3018 struct inode *dir_inode = dir->d_inode; 2992 struct inode *dir_inode = dir->d_inode;
3019 struct dentry *dentry; 2993 struct dentry *dentry;
3020 int error; 2994 int error;
3021 bool need_lookup; 2995 bool need_lookup = false;
3022 2996
3023 *opened &= ~FILE_CREATED; 2997 *opened &= ~FILE_CREATED;
3024 dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); 2998 dentry = lookup_dcache(&nd->last, dir, nd->flags);
3025 if (IS_ERR(dentry)) 2999 if (IS_ERR(dentry))
3026 return PTR_ERR(dentry); 3000 return PTR_ERR(dentry);
3027 3001
3028 /* Cached positive dentry: will open in f_op->open */ 3002 if (!dentry) {
3029 if (!need_lookup && dentry->d_inode) 3003 dentry = d_alloc(dir, &nd->last);
3004 if (unlikely(!dentry))
3005 return -ENOMEM;
3006 need_lookup = true;
3007 } else if (dentry->d_inode) {
3008 /* Cached positive dentry: will open in f_op->open */
3030 goto out_no_open; 3009 goto out_no_open;
3010 }
3031 3011
3032 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { 3012 if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
3033 return atomic_open(nd, dentry, path, file, op, got_write, 3013 return atomic_open(nd, dentry, path, file, op, got_write,
@@ -3111,13 +3091,14 @@ static int do_last(struct nameidata *nd,
3111 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 3091 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
3112 /* we _can_ be in RCU mode here */ 3092 /* we _can_ be in RCU mode here */
3113 error = lookup_fast(nd, &path, &inode, &seq); 3093 error = lookup_fast(nd, &path, &inode, &seq);
3114 if (likely(!error)) 3094 if (likely(error > 0))
3115 goto finish_lookup; 3095 goto finish_lookup;
3116 3096
3117 if (error < 0) 3097 if (error < 0)
3118 return error; 3098 return error;
3119 3099
3120 BUG_ON(nd->inode != dir->d_inode); 3100 BUG_ON(nd->inode != dir->d_inode);
3101 BUG_ON(nd->flags & LOOKUP_RCU);
3121 } else { 3102 } else {
3122 /* create side of things */ 3103 /* create side of things */
3123 /* 3104 /*
@@ -3172,12 +3153,6 @@ retry_lookup:
3172 } 3153 }
3173 3154
3174 /* 3155 /*
3175 * create/update audit record if it already exists.
3176 */
3177 if (d_is_positive(path.dentry))
3178 audit_inode(nd->name, path.dentry, 0);
3179
3180 /*
3181 * If atomic_open() acquired write access it is dropped now due to 3156 * If atomic_open() acquired write access it is dropped now due to
3182 * possible mount and symlink following (this might be optimized away if 3157 * possible mount and symlink following (this might be optimized away if
3183 * necessary...) 3158 * necessary...)
@@ -3187,6 +3162,16 @@ retry_lookup:
3187 got_write = false; 3162 got_write = false;
3188 } 3163 }
3189 3164
3165 if (unlikely(d_is_negative(path.dentry))) {
3166 path_to_nameidata(&path, nd);
3167 return -ENOENT;
3168 }
3169
3170 /*
3171 * create/update audit record if it already exists.
3172 */
3173 audit_inode(nd->name, path.dentry, 0);
3174
3190 if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { 3175 if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
3191 path_to_nameidata(&path, nd); 3176 path_to_nameidata(&path, nd);
3192 return -EEXIST; 3177 return -EEXIST;
@@ -3196,12 +3181,7 @@ retry_lookup:
3196 if (unlikely(error < 0)) 3181 if (unlikely(error < 0))
3197 return error; 3182 return error;
3198 3183
3199 BUG_ON(nd->flags & LOOKUP_RCU);
3200 seq = 0; /* out of RCU mode, so the value doesn't matter */ 3184 seq = 0; /* out of RCU mode, so the value doesn't matter */
3201 if (unlikely(d_is_negative(path.dentry))) {
3202 path_to_nameidata(&path, nd);
3203 return -ENOENT;
3204 }
3205 inode = d_backing_inode(path.dentry); 3185 inode = d_backing_inode(path.dentry);
3206finish_lookup: 3186finish_lookup:
3207 if (nd->depth) 3187 if (nd->depth)
@@ -3707,31 +3687,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3707 return sys_mkdirat(AT_FDCWD, pathname, mode); 3687 return sys_mkdirat(AT_FDCWD, pathname, mode);
3708} 3688}
3709 3689
3710/*
3711 * The dentry_unhash() helper will try to drop the dentry early: we
3712 * should have a usage count of 1 if we're the only user of this
3713 * dentry, and if that is true (possibly after pruning the dcache),
3714 * then we drop the dentry now.
3715 *
3716 * A low-level filesystem can, if it chooses, legally
3717 * do a
3718 *
3719 * if (!d_unhashed(dentry))
3720 * return -EBUSY;
3721 *
3722 * if it cannot handle the case of removing a directory
3723 * that is still in use by something else..
3724 */
3725void dentry_unhash(struct dentry *dentry)
3726{
3727 shrink_dcache_parent(dentry);
3728 spin_lock(&dentry->d_lock);
3729 if (dentry->d_lockref.count == 1)
3730 __d_drop(dentry);
3731 spin_unlock(&dentry->d_lock);
3732}
3733EXPORT_SYMBOL(dentry_unhash);
3734
3735int vfs_rmdir(struct inode *dir, struct dentry *dentry) 3690int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3736{ 3691{
3737 int error = may_delete(dir, dentry, 1); 3692 int error = may_delete(dir, dentry, 1);
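Taken together, the namei.c hunks above replace the open-coded __d_lookup()/__lookup_hash() sequence with a uniform two-step: consult the dcache, and only fall back to the slow on-disk lookup on a miss. A standalone C sketch of that control flow — the struct definitions and the two lookup stubs are illustrative placeholders, not the kernel API:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for kernel types; purely illustrative. */
struct dentry { int positive; };
struct qstr { const char *name; };

/* Hypothetical stubs modeling the two lookup stages. */
static struct dentry *lookup_dcache_stub(struct qstr *q, struct dentry *base)
{
	(void)q; (void)base;
	return NULL;		/* simulate a dcache miss */
}

static struct dentry *lookup_slow_stub(struct qstr *q, struct dentry *base)
{
	static struct dentry found = { .positive = 1 };
	(void)q; (void)base;
	return &found;		/* simulate an on-disk hit */
}

/* Mirrors the new lookup_one_len_unlocked() tail: try the cache,
 * fall back to the slow path only on a miss. */
static struct dentry *lookup_one(struct qstr *q, struct dentry *base)
{
	struct dentry *ret = lookup_dcache_stub(q, base);
	if (!ret)
		ret = lookup_slow_stub(q, base);
	return ret;
}

int main(void)
{
	struct qstr name = { "example" };
	struct dentry base = { 1 };
	printf("positive=%d\n", lookup_one(&name, &base)->positive);
	return 0;
}

The same cache-then-slow shape now backs lookup_one_len_unlocked(), mountpoint_last(), and lookup_open() in the hunks above.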
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index ddd0138f410c..02e4d87d2ed3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
446 kfree(bl); 446 kfree(bl);
447} 447}
448 448
449static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, 449static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
450 gfp_t gfp_flags) 450 gfp_t gfp_flags, bool is_scsi_layout)
451{ 451{
452 struct pnfs_block_layout *bl; 452 struct pnfs_block_layout *bl;
453 453
@@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
460 bl->bl_ext_ro = RB_ROOT; 460 bl->bl_ext_ro = RB_ROOT;
461 spin_lock_init(&bl->bl_ext_lock); 461 spin_lock_init(&bl->bl_ext_lock);
462 462
463 bl->bl_scsi_layout = is_scsi_layout;
463 return &bl->bl_layout; 464 return &bl->bl_layout;
464} 465}
465 466
467static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
468 gfp_t gfp_flags)
469{
470 return __bl_alloc_layout_hdr(inode, gfp_flags, false);
471}
472
473static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
474 gfp_t gfp_flags)
475{
476 return __bl_alloc_layout_hdr(inode, gfp_flags, true);
477}
478
466static void bl_free_lseg(struct pnfs_layout_segment *lseg) 479static void bl_free_lseg(struct pnfs_layout_segment *lseg)
467{ 480{
468 dprintk("%s enter\n", __func__); 481 dprintk("%s enter\n", __func__);
@@ -743,7 +756,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
743 756
744static bool 757static bool
745is_aligned_req(struct nfs_pageio_descriptor *pgio, 758is_aligned_req(struct nfs_pageio_descriptor *pgio,
746 struct nfs_page *req, unsigned int alignment) 759 struct nfs_page *req, unsigned int alignment, bool is_write)
747{ 760{
748 /* 761 /*
749 * Always accept buffered writes, higher layers take care of the 762 * Always accept buffered writes, higher layers take care of the
@@ -758,7 +771,8 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
758 if (IS_ALIGNED(req->wb_bytes, alignment)) 771 if (IS_ALIGNED(req->wb_bytes, alignment))
759 return true; 772 return true;
760 773
761 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) { 774 if (is_write &&
775 (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
762 /* 776 /*
763 * If the write goes up to the inode size, just write 777 * If the write goes up to the inode size, just write
764 * the full page. Data past the inode size is 778 * the full page. Data past the inode size is
@@ -775,7 +789,7 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
775static void 789static void
776bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 790bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
777{ 791{
778 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) { 792 if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
779 nfs_pageio_reset_read_mds(pgio); 793 nfs_pageio_reset_read_mds(pgio);
780 return; 794 return;
781 } 795 }
@@ -791,7 +805,7 @@ static size_t
791bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 805bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
792 struct nfs_page *req) 806 struct nfs_page *req)
793{ 807{
794 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) 808 if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
795 return 0; 809 return 0;
796 return pnfs_generic_pg_test(pgio, prev, req); 810 return pnfs_generic_pg_test(pgio, prev, req);
797} 811}
@@ -824,7 +838,7 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
824{ 838{
825 u64 wb_size; 839 u64 wb_size;
826 840
827 if (!is_aligned_req(pgio, req, PAGE_SIZE)) { 841 if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
828 nfs_pageio_reset_write_mds(pgio); 842 nfs_pageio_reset_write_mds(pgio);
829 return; 843 return;
830 } 844 }
@@ -846,7 +860,7 @@ static size_t
846bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 860bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
847 struct nfs_page *req) 861 struct nfs_page *req)
848{ 862{
849 if (!is_aligned_req(pgio, req, PAGE_SIZE)) 863 if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
850 return 0; 864 return 0;
851 return pnfs_generic_pg_test(pgio, prev, req); 865 return pnfs_generic_pg_test(pgio, prev, req);
852} 866}
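The new is_write flag threaded through is_aligned_req() above restricts the end-of-file exception to writes: a short write that lands exactly on i_size may be rounded up to a full page, while reads never get that pass. A self-contained model of the predicate (plain byte offsets here, not the kernel's nfs_page/sector types):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct req { uint64_t off, len; };

static bool is_aligned(const struct req *r, uint64_t align,
		       uint64_t i_size, bool is_write)
{
	if (r->off % align)
		return false;
	if (r->len % align == 0)
		return true;
	/* Only writes that end exactly at EOF get the full-page pass;
	 * the data past i_size is zeroed anyway. */
	return is_write && r->off + r->len == i_size;
}

int main(void)
{
	struct req r = { .off = 4096, .len = 100 };

	printf("write@eof=%d read@eof=%d\n",
	       is_aligned(&r, 4096, 4196, true),
	       is_aligned(&r, 4096, 4196, false));
	return 0;
}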
@@ -888,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
888 .sync = pnfs_generic_sync, 902 .sync = pnfs_generic_sync,
889}; 903};
890 904
905static struct pnfs_layoutdriver_type scsilayout_type = {
906 .id = LAYOUT_SCSI,
907 .name = "LAYOUT_SCSI",
908 .owner = THIS_MODULE,
909 .flags = PNFS_LAYOUTRET_ON_SETATTR |
910 PNFS_READ_WHOLE_PAGE,
911 .read_pagelist = bl_read_pagelist,
912 .write_pagelist = bl_write_pagelist,
913 .alloc_layout_hdr = sl_alloc_layout_hdr,
914 .free_layout_hdr = bl_free_layout_hdr,
915 .alloc_lseg = bl_alloc_lseg,
916 .free_lseg = bl_free_lseg,
917 .return_range = bl_return_range,
918 .prepare_layoutcommit = bl_prepare_layoutcommit,
919 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
920 .set_layoutdriver = bl_set_layoutdriver,
921 .alloc_deviceid_node = bl_alloc_deviceid_node,
922 .free_deviceid_node = bl_free_deviceid_node,
923 .pg_read_ops = &bl_pg_read_ops,
924 .pg_write_ops = &bl_pg_write_ops,
925 .sync = pnfs_generic_sync,
926};
927
928
891static int __init nfs4blocklayout_init(void) 929static int __init nfs4blocklayout_init(void)
892{ 930{
893 int ret; 931 int ret;
894 932
895 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); 933 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
896 934
897 ret = pnfs_register_layoutdriver(&blocklayout_type); 935 ret = bl_init_pipefs();
898 if (ret) 936 if (ret)
899 goto out; 937 goto out;
900 ret = bl_init_pipefs(); 938
939 ret = pnfs_register_layoutdriver(&blocklayout_type);
901 if (ret) 940 if (ret)
902 goto out_unregister; 941 goto out_cleanup_pipe;
942
943 ret = pnfs_register_layoutdriver(&scsilayout_type);
944 if (ret)
945 goto out_unregister_block;
903 return 0; 946 return 0;
904 947
905out_unregister: 948out_unregister_block:
906 pnfs_unregister_layoutdriver(&blocklayout_type); 949 pnfs_unregister_layoutdriver(&blocklayout_type);
950out_cleanup_pipe:
951 bl_cleanup_pipefs();
907out: 952out:
908 return ret; 953 return ret;
909} 954}
@@ -913,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void)
913 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 958 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
914 __func__); 959 __func__);
915 960
916 bl_cleanup_pipefs(); 961 pnfs_unregister_layoutdriver(&scsilayout_type);
917 pnfs_unregister_layoutdriver(&blocklayout_type); 962 pnfs_unregister_layoutdriver(&blocklayout_type);
963 bl_cleanup_pipefs();
918} 964}
919 965
920MODULE_ALIAS("nfs-layouttype4-3"); 966MODULE_ALIAS("nfs-layouttype4-3");
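The reordered nfs4blocklayout_init() above keeps the usual inverse-order unwind: pipefs first, then the block driver, then the SCSI driver, with each failure label undoing exactly what succeeded before it. A compilable skeleton of that shape, with three stubs standing in for bl_init_pipefs() and the two pnfs_register_layoutdriver() calls:

#include <stdio.h>

static int reg_pipefs(void)	{ return 0; }
static int reg_block(void)	{ return 0; }
static int reg_scsi(void)	{ return 0; }
static void unreg_block(void)	{ puts("unregister block"); }
static void clean_pipefs(void)	{ puts("cleanup pipefs"); }

static int init_sketch(void)
{
	int ret = reg_pipefs();
	if (ret)
		goto out;
	ret = reg_block();
	if (ret)
		goto out_cleanup_pipe;
	ret = reg_scsi();
	if (ret)
		goto out_unregister_block;
	return 0;

out_unregister_block:
	unreg_block();
out_cleanup_pipe:
	clean_pipefs();
out:
	return ret;
}

int main(void) { return init_sketch(); }

The exit path above mirrors this in reverse: SCSI driver, block driver, then pipefs.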
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c556640dcf3b..bc21205309e0 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -55,7 +55,6 @@ struct pnfs_block_dev;
55 */ 55 */
56#define PNFS_BLOCK_UUID_LEN 128 56#define PNFS_BLOCK_UUID_LEN 128
57 57
58
59struct pnfs_block_volume { 58struct pnfs_block_volume {
60 enum pnfs_block_volume_type type; 59 enum pnfs_block_volume_type type;
61 union { 60 union {
@@ -82,6 +81,13 @@ struct pnfs_block_volume {
82 u32 volumes_count; 81 u32 volumes_count;
83 u32 volumes[PNFS_BLOCK_MAX_DEVICES]; 82 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
84 } stripe; 83 } stripe;
84 struct {
85 enum scsi_code_set code_set;
86 enum scsi_designator_type designator_type;
87 int designator_len;
88 u8 designator[256];
89 u64 pr_key;
90 } scsi;
85 }; 91 };
86}; 92};
87 93
@@ -106,6 +112,9 @@ struct pnfs_block_dev {
106 struct block_device *bdev; 112 struct block_device *bdev;
107 u64 disk_offset; 113 u64 disk_offset;
108 114
115 u64 pr_key;
116 bool pr_registered;
117
109 bool (*map)(struct pnfs_block_dev *dev, u64 offset, 118 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
110 struct pnfs_block_dev_map *map); 119 struct pnfs_block_dev_map *map);
111}; 120};
@@ -131,6 +140,7 @@ struct pnfs_block_layout {
131 struct rb_root bl_ext_rw; 140 struct rb_root bl_ext_rw;
132 struct rb_root bl_ext_ro; 141 struct rb_root bl_ext_ro;
133 spinlock_t bl_ext_lock; /* Protects list manipulation */ 142 spinlock_t bl_ext_lock; /* Protects list manipulation */
143 bool bl_scsi_layout;
134}; 144};
135 145
136static inline struct pnfs_block_layout * 146static inline struct pnfs_block_layout *
@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
182dev_t bl_resolve_deviceid(struct nfs_server *server, 192dev_t bl_resolve_deviceid(struct nfs_server *server,
183 struct pnfs_block_volume *b, gfp_t gfp_mask); 193 struct pnfs_block_volume *b, gfp_t gfp_mask);
184int __init bl_init_pipefs(void); 194int __init bl_init_pipefs(void);
185void __exit bl_cleanup_pipefs(void); 195void bl_cleanup_pipefs(void);
186 196
187#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 197#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a861bbdfe577..e5b89675263e 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/sunrpc/svc.h> 4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h> 5#include <linux/blkdev.h>
6#include <linux/nfs4.h> 6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h> 7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h> 8#include <linux/nfs_xdr.h>
9#include <linux/pr.h>
9 10
10#include "blocklayout.h" 11#include "blocklayout.h"
11 12
@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
21 bl_free_device(&dev->children[i]); 22 bl_free_device(&dev->children[i]);
22 kfree(dev->children); 23 kfree(dev->children);
23 } else { 24 } else {
25 if (dev->pr_registered) {
26 const struct pr_ops *ops =
27 dev->bdev->bd_disk->fops->pr_ops;
28 int error;
29
30 error = ops->pr_register(dev->bdev, dev->pr_key, 0,
31 false);
32 if (error)
33 pr_err("failed to unregister PR key.\n");
34 }
35
24 if (dev->bdev) 36 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); 37 blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
26 } 38 }
@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
113 for (i = 0; i < b->stripe.volumes_count; i++) 125 for (i = 0; i < b->stripe.volumes_count; i++)
114 b->stripe.volumes[i] = be32_to_cpup(p++); 126 b->stripe.volumes[i] = be32_to_cpup(p++);
115 break; 127 break;
128 case PNFS_BLOCK_VOLUME_SCSI:
129 p = xdr_inline_decode(xdr, 4 + 4 + 4);
130 if (!p)
131 return -EIO;
132 b->scsi.code_set = be32_to_cpup(p++);
133 b->scsi.designator_type = be32_to_cpup(p++);
134 b->scsi.designator_len = be32_to_cpup(p++);
135 p = xdr_inline_decode(xdr, b->scsi.designator_len);
136 if (!p)
137 return -EIO;
138 if (b->scsi.designator_len > 256)
139 return -EIO;
140 memcpy(&b->scsi.designator, p, b->scsi.designator_len);
141 p = xdr_inline_decode(xdr, 8);
142 if (!p)
143 return -EIO;
144 p = xdr_decode_hyper(p, &b->scsi.pr_key);
145 break;
116 default: 146 default:
117 dprintk("unknown volume type!\n"); 147 dprintk("unknown volume type!\n");
118 return -EIO; 148 return -EIO;
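The PNFS_BLOCK_VOLUME_SCSI arm above pulls, in wire order: code_set (4 bytes), designator_type (4), designator_len (4), the designator bytes, then an 8-byte pr_key. A standalone big-endian decoder modeling that layout — here the length is validated before copying, and XDR padding is ignored for brevity:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct scsi_vol {
	uint32_t code_set, designator_type, designator_len;
	uint8_t  designator[256];
	uint64_t pr_key;
};

static uint32_t be32(const uint8_t *p)
{
	return (uint32_t)p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
}

static uint64_t be64(const uint8_t *p)
{
	return (uint64_t)be32(p) << 32 | be32(p + 4);
}

/* Returns 0 on success, -1 on a malformed buffer. */
static int decode_scsi_volume(const uint8_t *p, size_t len, struct scsi_vol *v)
{
	if (len < 12)
		return -1;
	v->code_set        = be32(p);
	v->designator_type = be32(p + 4);
	v->designator_len  = be32(p + 8);
	if (v->designator_len > sizeof(v->designator) ||
	    len < 12 + v->designator_len + 8)
		return -1;
	memcpy(v->designator, p + 12, v->designator_len);
	v->pr_key = be64(p + 12 + v->designator_len);
	return 0;
}

int main(void)
{
	uint8_t buf[12 + 8 + 8] = {0};
	struct scsi_vol v;

	buf[11] = 8;	/* designator_len = 8 */
	printf("decode=%d len=%u\n",
	       decode_scsi_volume(buf, sizeof(buf), &v), v.designator_len);
	return 0;
}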
@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
216 return 0; 246 return 0;
217} 247}
218 248
249static bool
250bl_validate_designator(struct pnfs_block_volume *v)
251{
252 switch (v->scsi.designator_type) {
253 case PS_DESIGNATOR_EUI64:
254 if (v->scsi.code_set != PS_CODE_SET_BINARY)
255 return false;
256
257 if (v->scsi.designator_len != 8 &&
258 v->scsi.designator_len != 10 &&
259 v->scsi.designator_len != 16)
260 return false;
261
262 return true;
263 case PS_DESIGNATOR_NAA:
264 if (v->scsi.code_set != PS_CODE_SET_BINARY)
265 return false;
266
267 if (v->scsi.designator_len != 8 &&
268 v->scsi.designator_len != 16)
269 return false;
270
271 return true;
272 case PS_DESIGNATOR_T10:
273 case PS_DESIGNATOR_NAME:
274 pr_err("pNFS: unsupported designator "
275 "(code set %d, type %d, len %d.\n",
276 v->scsi.code_set,
277 v->scsi.designator_type,
278 v->scsi.designator_len);
279 return false;
280 default:
281 pr_err("pNFS: invalid designator "
282 "(code set %d, type %d, len %d.\n",
283 v->scsi.code_set,
284 v->scsi.designator_type,
285 v->scsi.designator_len);
286 return false;
287 }
288}
289
290static int
291bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
292 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
293{
294 struct pnfs_block_volume *v = &volumes[idx];
295 const struct pr_ops *ops;
296 const char *devname;
297 int error;
298
299 if (!bl_validate_designator(v))
300 return -EINVAL;
301
302 switch (v->scsi.designator_len) {
303 case 8:
304 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
305 v->scsi.designator);
306 break;
307 case 12:
308 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
309 v->scsi.designator);
310 break;
311 case 16:
312 devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
313 v->scsi.designator);
314 break;
315 default:
316 return -EINVAL;
317 }
318
319 d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
320 if (IS_ERR(d->bdev)) {
321 pr_warn("pNFS: failed to open device %s (%ld)\n",
322 devname, PTR_ERR(d->bdev));
323 kfree(devname);
324 return PTR_ERR(d->bdev);
325 }
326
327 kfree(devname);
328
329 d->len = i_size_read(d->bdev->bd_inode);
330 d->map = bl_map_simple;
331 d->pr_key = v->scsi.pr_key;
332
333 pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
334 d->bdev->bd_disk->disk_name, d->pr_key);
335
336 ops = d->bdev->bd_disk->fops->pr_ops;
337 if (!ops) {
338 pr_err("pNFS: block device %s does not support reservations.",
339 d->bdev->bd_disk->disk_name);
340 error = -EINVAL;
341 goto out_blkdev_put;
342 }
343
344 error = ops->pr_register(d->bdev, 0, d->pr_key, true);
345 if (error) {
346 pr_err("pNFS: failed to register key for block device %s.",
347 d->bdev->bd_disk->disk_name);
348 goto out_blkdev_put;
349 }
350
351 d->pr_registered = true;
352 return 0;
353
354out_blkdev_put:
355 blkdev_put(d->bdev, FMODE_READ);
356 return error;
357}
358
219static int 359static int
220bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, 360bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
221 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 361 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
303 return bl_parse_concat(server, d, volumes, idx, gfp_mask); 443 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
304 case PNFS_BLOCK_VOLUME_STRIPE: 444 case PNFS_BLOCK_VOLUME_STRIPE:
305 return bl_parse_stripe(server, d, volumes, idx, gfp_mask); 445 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
446 case PNFS_BLOCK_VOLUME_SCSI:
447 return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
306 default: 448 default:
307 dprintk("unsupported volume type: %d\n", volumes[idx].type); 449 dprintk("unsupported volume type: %d\n", volumes[idx].type);
308 return -EIO; 450 return -EIO;
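bl_validate_designator() above accepts only binary EUI-64 designators of 8, 10, or 16 bytes and binary NAA designators of 8 or 16 bytes; T10 and NAME designators are rejected. The same rules as a standalone predicate (the enum values are made-up stand-ins for the PS_* constants, which are defined outside this section):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the PS_* constants used above. */
enum code_set { CODE_SET_BINARY = 1 };
enum desig_type { DESIG_T10 = 1, DESIG_EUI64 = 2, DESIG_NAA = 3, DESIG_NAME = 8 };

static bool validate_designator(enum code_set cs, enum desig_type t, int len)
{
	switch (t) {
	case DESIG_EUI64:
		return cs == CODE_SET_BINARY &&
		       (len == 8 || len == 10 || len == 16);
	case DESIG_NAA:
		return cs == CODE_SET_BINARY && (len == 8 || len == 16);
	default:	/* T10 and NAME designators are unsupported */
		return false;
	}
}

int main(void)
{
	printf("eui64/10=%d naa/10=%d\n",
	       validate_designator(CODE_SET_BINARY, DESIG_EUI64, 10),
	       validate_designator(CODE_SET_BINARY, DESIG_NAA, 10));
	return 0;
}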
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 35ab51c04814..720b3ff55fa9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4 4
5#include <linux/vmalloc.h> 5#include <linux/vmalloc.h>
@@ -462,10 +462,12 @@ out:
462 return err; 462 return err;
463} 463}
464 464
465static size_t ext_tree_layoutupdate_size(size_t count) 465static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
466{ 466{
467 return sizeof(__be32) /* number of entries */ + 467 if (bl->bl_scsi_layout)
468 PNFS_BLOCK_EXTENT_SIZE * count; 468 return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
469 else
470 return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
469} 471}
470 472
471static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, 473static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
@@ -483,6 +485,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
483 } 485 }
484} 486}
485 487
488static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
489{
490 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
491 NFS4_DEVICEID4_SIZE);
492 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
493 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
494 p = xdr_encode_hyper(p, 0LL);
495 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
496 return p;
497}
498
499static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
500{
501 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
502 return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
503}
504
486static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 505static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
487 size_t buffer_size, size_t *count) 506 size_t buffer_size, size_t *count)
488{ 507{
@@ -496,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
496 continue; 515 continue;
497 516
498 (*count)++; 517 (*count)++;
499 if (ext_tree_layoutupdate_size(*count) > buffer_size) { 518 if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
500 /* keep counting.. */ 519 /* keep counting.. */
501 ret = -ENOSPC; 520 ret = -ENOSPC;
502 continue; 521 continue;
503 } 522 }
504 523
505 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, 524 if (bl->bl_scsi_layout)
506 NFS4_DEVICEID4_SIZE); 525 p = encode_scsi_range(be, p);
507 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 526 else
508 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 527 p = encode_block_extent(be, p);
509 p = xdr_encode_hyper(p, 0LL);
510 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
511
512 be->be_tag = EXTENT_COMMITTING; 528 be->be_tag = EXTENT_COMMITTING;
513 } 529 }
514 spin_unlock(&bl->bl_ext_lock); 530 spin_unlock(&bl->bl_ext_lock);
@@ -537,7 +553,7 @@ retry:
537 if (unlikely(ret)) { 553 if (unlikely(ret)) {
538 ext_tree_free_commitdata(arg, buffer_size); 554 ext_tree_free_commitdata(arg, buffer_size);
539 555
540 buffer_size = ext_tree_layoutupdate_size(count); 556 buffer_size = ext_tree_layoutupdate_size(bl, count);
541 count = 0; 557 count = 0;
542 558
543 arg->layoutupdate_pages = 559 arg->layoutupdate_pages =
@@ -556,7 +572,7 @@ retry:
556 } 572 }
557 573
558 *start_p = cpu_to_be32(count); 574 *start_p = cpu_to_be32(count);
559 arg->layoutupdate_len = ext_tree_layoutupdate_size(count); 575 arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
560 576
561 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { 577 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
562 void *p = start_p, *end = p + arg->layoutupdate_len; 578 void *p = start_p, *end = p + arg->layoutupdate_len;
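ext_tree_layoutupdate_size() above prices the LAYOUTCOMMIT payload as a 4-byte entry count plus one fixed-size record per committed extent, and SCSI records carry only offset and length. A sketch of the arithmetic; the two record sizes are inferred from the encoders above (16 bytes for a SCSI range, deviceid plus three hypers plus a state word for a block extent) rather than taken from the headers:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SCSI_RANGE_SIZE   (8 + 8)               /* offset + length */
#define BLOCK_EXTENT_SIZE (16 + 8 + 8 + 8 + 4)  /* deviceid + 3 hypers + state */

static size_t layoutupdate_size(int scsi_layout, size_t count)
{
	size_t per_entry = scsi_layout ? SCSI_RANGE_SIZE : BLOCK_EXTENT_SIZE;
	return sizeof(uint32_t) /* entry count */ + per_entry * count;
}

int main(void)
{
	printf("block x4: %zu bytes, scsi x4: %zu bytes\n",
	       layoutupdate_size(0, 4), layoutupdate_size(1, 4));
	return 0;
}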
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index dbe5839cdeba..9fb067a6f7e0 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -281,7 +281,7 @@ out:
281 return ret; 281 return ret;
282} 282}
283 283
284void __exit bl_cleanup_pipefs(void) 284void bl_cleanup_pipefs(void)
285{ 285{
286 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 286 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
287 unregister_pernet_subsys(&nfs4blocklayout_net_ops); 287 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index ff8195bd75ea..5fe1cecbf9f0 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -37,10 +37,11 @@ enum nfs4_callback_opnum {
37 OP_CB_ILLEGAL = 10044, 37 OP_CB_ILLEGAL = 10044,
38}; 38};
39 39
40struct nfs4_slot;
40struct cb_process_state { 41struct cb_process_state {
41 __be32 drc_status; 42 __be32 drc_status;
42 struct nfs_client *clp; 43 struct nfs_client *clp;
43 u32 slotid; 44 struct nfs4_slot *slot;
44 u32 minorversion; 45 u32 minorversion;
45 struct net *net; 46 struct net *net;
46}; 47};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f0939d097406..618ced381a14 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -354,47 +354,38 @@ out:
354 * a single outstanding callback request at a time. 354 * a single outstanding callback request at a time.
355 */ 355 */
356static __be32 356static __be32
357validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) 357validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
358 const struct cb_sequenceargs * args)
358{ 359{
359 struct nfs4_slot *slot; 360 dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n",
360 361 __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr);
361 dprintk("%s enter. slotid %u seqid %u\n",
362 __func__, args->csa_slotid, args->csa_sequenceid);
363 362
364 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) 363 if (args->csa_slotid > tbl->server_highest_slotid)
365 return htonl(NFS4ERR_BADSLOT); 364 return htonl(NFS4ERR_BADSLOT);
366 365
367 slot = tbl->slots + args->csa_slotid;
368 dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
369
370 /* Normal */
371 if (likely(args->csa_sequenceid == slot->seq_nr + 1))
372 goto out_ok;
373
374 /* Replay */ 366 /* Replay */
375 if (args->csa_sequenceid == slot->seq_nr) { 367 if (args->csa_sequenceid == slot->seq_nr) {
376 dprintk("%s seqid %u is a replay\n", 368 dprintk("%s seqid %u is a replay\n",
377 __func__, args->csa_sequenceid); 369 __func__, args->csa_sequenceid);
370 if (nfs4_test_locked_slot(tbl, slot->slot_nr))
371 return htonl(NFS4ERR_DELAY);
378 /* Signal process_op to set this error on next op */ 372 /* Signal process_op to set this error on next op */
379 if (args->csa_cachethis == 0) 373 if (args->csa_cachethis == 0)
380 return htonl(NFS4ERR_RETRY_UNCACHED_REP); 374 return htonl(NFS4ERR_RETRY_UNCACHED_REP);
381 375
382 /* The ca_maxresponsesize_cached is 0 with no DRC */ 376 /* Liar! We never allowed you to set csa_cachethis != 0 */
383 else if (args->csa_cachethis == 1) 377 return htonl(NFS4ERR_SEQ_FALSE_RETRY);
384 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
385 } 378 }
386 379
387 /* Wraparound */ 380 /* Wraparound */
388 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { 381 if (unlikely(slot->seq_nr == 0xFFFFFFFFU)) {
389 slot->seq_nr = 1; 382 if (args->csa_sequenceid == 1)
390 goto out_ok; 383 return htonl(NFS4_OK);
391 } 384 } else if (likely(args->csa_sequenceid == slot->seq_nr + 1))
385 return htonl(NFS4_OK);
392 386
393 /* Misordered request */ 387 /* Misordered request */
394 return htonl(NFS4ERR_SEQ_MISORDERED); 388 return htonl(NFS4ERR_SEQ_MISORDERED);
395out_ok:
396 tbl->highest_used_slotid = args->csa_slotid;
397 return htonl(NFS4_OK);
398} 389}
399 390
400/* 391/*
@@ -473,6 +464,12 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
473 tbl = &clp->cl_session->bc_slot_table; 464 tbl = &clp->cl_session->bc_slot_table;
474 slot = tbl->slots + args->csa_slotid; 465 slot = tbl->slots + args->csa_slotid;
475 466
467 /* Set up res before grabbing the spinlock */
468 memcpy(&res->csr_sessionid, &args->csa_sessionid,
469 sizeof(res->csr_sessionid));
470 res->csr_sequenceid = args->csa_sequenceid;
471 res->csr_slotid = args->csa_slotid;
472
476 spin_lock(&tbl->slot_tbl_lock); 473 spin_lock(&tbl->slot_tbl_lock);
477 /* state manager is resetting the session */ 474 /* state manager is resetting the session */
478 if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { 475 if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
@@ -485,18 +482,26 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
485 goto out_unlock; 482 goto out_unlock;
486 } 483 }
487 484
488 memcpy(&res->csr_sessionid, &args->csa_sessionid, 485 status = htonl(NFS4ERR_BADSLOT);
489 sizeof(res->csr_sessionid)); 486 slot = nfs4_lookup_slot(tbl, args->csa_slotid);
490 res->csr_sequenceid = args->csa_sequenceid; 487 if (IS_ERR(slot))
491 res->csr_slotid = args->csa_slotid; 488 goto out_unlock;
492 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 489
493 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 490 res->csr_highestslotid = tbl->server_highest_slotid;
491 res->csr_target_highestslotid = tbl->target_highest_slotid;
494 492
495 status = validate_seqid(tbl, args); 493 status = validate_seqid(tbl, slot, args);
496 if (status) 494 if (status)
497 goto out_unlock; 495 goto out_unlock;
496 if (!nfs4_try_to_lock_slot(tbl, slot)) {
497 status = htonl(NFS4ERR_DELAY);
498 goto out_unlock;
499 }
500 cps->slot = slot;
498 501
499 cps->slotid = args->csa_slotid; 502 /* The ca_maxresponsesize_cached is 0 with no DRC */
503 if (args->csa_cachethis != 0)
504 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
500 505
501 /* 506 /*
502 * Check for pending referring calls. If a match is found, a 507 * Check for pending referring calls. If a match is found, a
@@ -513,7 +518,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
513 * If CB_SEQUENCE returns an error, then the state of the slot 518 * If CB_SEQUENCE returns an error, then the state of the slot
514 * (sequence ID, cached reply) MUST NOT change. 519 * (sequence ID, cached reply) MUST NOT change.
515 */ 520 */
516 slot->seq_nr++; 521 slot->seq_nr = args->csa_sequenceid;
517out_unlock: 522out_unlock:
518 spin_unlock(&tbl->slot_tbl_lock); 523 spin_unlock(&tbl->slot_tbl_lock);
519 524
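validate_seqid() above now reduces to three outcomes: the same seqid is a replay, seqid + 1 (or 1 after the counter wraps at 0xFFFFFFFF) is normal progress, and anything else is misordered. A standalone model of that classification, with small ints in place of the NFS4ERR_* codes:

#include <stdint.h>
#include <stdio.h>

enum { OK = 0, REPLAY = 1, MISORDERED = 2 };

static int classify_seqid(uint32_t slot_seq, uint32_t csa_seq)
{
	if (csa_seq == slot_seq)
		return REPLAY;		/* retransmission of the last request */
	if (slot_seq == 0xFFFFFFFFu) {
		if (csa_seq == 1)	/* wraparound restarts at 1, not 0 */
			return OK;
	} else if (csa_seq == slot_seq + 1) {
		return OK;
	}
	return MISORDERED;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       classify_seqid(5, 6),		/* OK */
	       classify_seqid(5, 5),		/* REPLAY */
	       classify_seqid(0xFFFFFFFFu, 1),	/* OK (wrap) */
	       classify_seqid(5, 7));		/* MISORDERED */
	return 0;
}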
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 646cdac73488..976c90608e56 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -752,7 +752,8 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
752 return htonl(NFS_OK); 752 return htonl(NFS_OK);
753} 753}
754 754
755static void nfs4_callback_free_slot(struct nfs4_session *session) 755static void nfs4_callback_free_slot(struct nfs4_session *session,
756 struct nfs4_slot *slot)
756{ 757{
757 struct nfs4_slot_table *tbl = &session->bc_slot_table; 758 struct nfs4_slot_table *tbl = &session->bc_slot_table;
758 759
@@ -761,15 +762,17 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
761 * Let the state manager know callback processing done. 762 * Let the state manager know callback processing done.
762 * A single slot, so highest used slotid is either 0 or -1 763 * A single slot, so highest used slotid is either 0 or -1
763 */ 764 */
764 tbl->highest_used_slotid = NFS4_NO_SLOT; 765 nfs4_free_slot(tbl, slot);
765 nfs4_slot_tbl_drain_complete(tbl); 766 nfs4_slot_tbl_drain_complete(tbl);
766 spin_unlock(&tbl->slot_tbl_lock); 767 spin_unlock(&tbl->slot_tbl_lock);
767} 768}
768 769
769static void nfs4_cb_free_slot(struct cb_process_state *cps) 770static void nfs4_cb_free_slot(struct cb_process_state *cps)
770{ 771{
771 if (cps->slotid != NFS4_NO_SLOT) 772 if (cps->slot) {
772 nfs4_callback_free_slot(cps->clp->cl_session); 773 nfs4_callback_free_slot(cps->clp->cl_session, cps->slot);
774 cps->slot = NULL;
775 }
773} 776}
774 777
775#else /* CONFIG_NFS_V4_1 */ 778#else /* CONFIG_NFS_V4_1 */
@@ -893,7 +896,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
893 struct cb_process_state cps = { 896 struct cb_process_state cps = {
894 .drc_status = 0, 897 .drc_status = 0,
895 .clp = NULL, 898 .clp = NULL,
896 .slotid = NFS4_NO_SLOT,
897 .net = SVC_NET(rqstp), 899 .net = SVC_NET(rqstp),
898 }; 900 };
899 unsigned int nops = 0; 901 unsigned int nops = 0;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9cce67043f92..4bfa7d8bcade 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1360 dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); 1360 dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
1361 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); 1361 nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
1362 1362
1363 res = ERR_PTR(-ENAMETOOLONG); 1363 if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
1364 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1364 return ERR_PTR(-ENAMETOOLONG);
1365 goto out;
1366 1365
1367 /* 1366 /*
1368 * If we're doing an exclusive create, optimize away the lookup 1367 * If we're doing an exclusive create, optimize away the lookup
1369 * but don't hash the dentry. 1368 * but don't hash the dentry.
1370 */ 1369 */
1371 if (nfs_is_exclusive_create(dir, flags)) { 1370 if (nfs_is_exclusive_create(dir, flags))
1372 d_instantiate(dentry, NULL); 1371 return NULL;
1373 res = NULL;
1374 goto out;
1375 }
1376 1372
1377 res = ERR_PTR(-ENOMEM); 1373 res = ERR_PTR(-ENOMEM);
1378 fhandle = nfs_alloc_fhandle(); 1374 fhandle = nfs_alloc_fhandle();
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 748bb813b8ec..89bf093d342a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
233 * nfs_file_write() that a write error occurred, and hence cause it to 233 * nfs_file_write() that a write error occurred, and hence cause it to
234 * fall back to doing a synchronous write. 234 * fall back to doing a synchronous write.
235 */ 235 */
236int 236static int
237nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) 237nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
238{ 238{
239 struct nfs_open_context *ctx = nfs_file_open_context(file); 239 struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -263,9 +263,8 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
263out: 263out:
264 return ret; 264 return ret;
265} 265}
266EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
267 266
268static int 267int
269nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) 268nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
270{ 269{
271 int ret; 270 int ret;
@@ -273,13 +272,15 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
273 272
274 trace_nfs_fsync_enter(inode); 273 trace_nfs_fsync_enter(inode);
275 274
276 nfs_inode_dio_wait(inode); 275 inode_dio_wait(inode);
277 do { 276 do {
278 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 277 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
279 if (ret != 0) 278 if (ret != 0)
280 break; 279 break;
281 inode_lock(inode); 280 inode_lock(inode);
282 ret = nfs_file_fsync_commit(file, start, end, datasync); 281 ret = nfs_file_fsync_commit(file, start, end, datasync);
282 if (!ret)
283 ret = pnfs_sync_inode(inode, !!datasync);
283 inode_unlock(inode); 284 inode_unlock(inode);
284 /* 285 /*
285 * If nfs_file_fsync_commit detected a server reboot, then 286 * If nfs_file_fsync_commit detected a server reboot, then
@@ -293,6 +294,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
293 trace_nfs_fsync_exit(inode, ret); 294 trace_nfs_fsync_exit(inode, ret);
294 return ret; 295 return ret;
295} 296}
297EXPORT_SYMBOL_GPL(nfs_file_fsync);
296 298
297/* 299/*
298 * Decide whether a read/modify/write cycle may be more efficient 300 * Decide whether a read/modify/write cycle may be more efficient
@@ -368,7 +370,7 @@ start:
368 /* 370 /*
369 * Wait for O_DIRECT to complete 371 * Wait for O_DIRECT to complete
370 */ 372 */
371 nfs_inode_dio_wait(mapping->host); 373 inode_dio_wait(mapping->host);
372 374
373 page = grab_cache_page_write_begin(mapping, index, flags); 375 page = grab_cache_page_write_begin(mapping, index, flags);
374 if (!page) 376 if (!page)
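With nfs4_file_fsync() folded in above, a single nfs_file_fsync() owns the reboot retry: when the commit path returns -EAGAIN, the range is widened to the whole file before retrying. A standalone model of that loop, with a stub commit that fails once to simulate a server reboot:

#include <limits.h>
#include <stdio.h>

#define SIM_EAGAIN (-11)

static int commit_range(long long start, long long end)
{
	static int calls;
	(void)start; (void)end;
	return calls++ == 0 ? SIM_EAGAIN : 0;	/* first pass hits a "reboot" */
}

static int fsync_model(long long start, long long end)
{
	int ret;

	do {
		ret = commit_range(start, end);
		/* A server reboot may drop earlier writes: widen to the
		 * whole file before retrying, as in start = 0 / end = LLONG_MAX. */
		start = 0;
		end = LLONG_MAX;
	} while (ret == SIM_EAGAIN);
	return ret;
}

int main(void)
{
	printf("fsync: %d\n", fsync_model(0, 4096));
	return 0;
}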
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index eb370460ce20..add0e5a70bd6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -418,6 +418,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
418 pnfs_error_mark_layout_for_return(ino, lseg); 418 pnfs_error_mark_layout_for_return(ino, lseg);
419 } else 419 } else
420 pnfs_error_mark_layout_for_return(ino, lseg); 420 pnfs_error_mark_layout_for_return(ino, lseg);
421 ds = NULL;
422 goto out;
421 } 423 }
422out_update_creds: 424out_update_creds:
423 if (ff_layout_update_mirror_cred(mirror, ds)) 425 if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 86faecf8f328..33d18c411905 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -141,7 +141,7 @@ void nfs_evict_inode(struct inode *inode)
141 141
142int nfs_sync_inode(struct inode *inode) 142int nfs_sync_inode(struct inode *inode)
143{ 143{
144 nfs_inode_dio_wait(inode); 144 inode_dio_wait(inode);
145 return nfs_wb_all(inode); 145 return nfs_wb_all(inode);
146} 146}
147EXPORT_SYMBOL_GPL(nfs_sync_inode); 147EXPORT_SYMBOL_GPL(nfs_sync_inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9a547aa3ec8e..565f8135ae1f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -358,7 +358,7 @@ int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
358int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); 358int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
359 359
360/* file.c */ 360/* file.c */
361int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); 361int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
362loff_t nfs_file_llseek(struct file *, loff_t, int); 362loff_t nfs_file_llseek(struct file *, loff_t, int);
363ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); 363ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
364ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, 364ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
@@ -515,10 +515,6 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
515/* direct.c */ 515/* direct.c */
516void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, 516void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
517 struct nfs_direct_req *dreq); 517 struct nfs_direct_req *dreq);
518static inline void nfs_inode_dio_wait(struct inode *inode)
519{
520 inode_dio_wait(inode);
521}
522extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); 518extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
523 519
524/* nfs4proc.c */ 520/* nfs4proc.c */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57ca1c8039c1..22c35abbee9d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -128,37 +128,6 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
128 return vfs_fsync(file, 0); 128 return vfs_fsync(file, 0);
129} 129}
130 130
131static int
132nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
133{
134 int ret;
135 struct inode *inode = file_inode(file);
136
137 trace_nfs_fsync_enter(inode);
138
139 nfs_inode_dio_wait(inode);
140 do {
141 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
142 if (ret != 0)
143 break;
144 inode_lock(inode);
145 ret = nfs_file_fsync_commit(file, start, end, datasync);
146 if (!ret)
147 ret = pnfs_sync_inode(inode, !!datasync);
148 inode_unlock(inode);
149 /*
150 * If nfs_file_fsync_commit detected a server reboot, then
151 * resend all dirty pages that might have been covered by
152 * the NFS_CONTEXT_RESEND_WRITES flag
153 */
154 start = 0;
155 end = LLONG_MAX;
156 } while (ret == -EAGAIN);
157
158 trace_nfs_fsync_exit(inode, ret);
159 return ret;
160}
161
162#ifdef CONFIG_NFS_V4_2 131#ifdef CONFIG_NFS_V4_2
163static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) 132static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
164{ 133{
@@ -266,7 +235,7 @@ const struct file_operations nfs4_file_operations = {
266 .open = nfs4_file_open, 235 .open = nfs4_file_open,
267 .flush = nfs4_file_flush, 236 .flush = nfs4_file_flush,
268 .release = nfs_file_release, 237 .release = nfs_file_release,
269 .fsync = nfs4_file_fsync, 238 .fsync = nfs_file_fsync,
270 .lock = nfs_lock, 239 .lock = nfs_lock,
271 .flock = nfs_flock, 240 .flock = nfs_flock,
272 .splice_read = nfs_file_splice_read, 241 .splice_read = nfs_file_splice_read,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 14881594dd07..327b8c34d360 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2461,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2461 2461
2462 dentry = opendata->dentry; 2462 dentry = opendata->dentry;
2463 if (d_really_is_negative(dentry)) { 2463 if (d_really_is_negative(dentry)) {
2464 /* FIXME: Is this d_drop() ever needed? */ 2464 struct dentry *alias;
2465 d_drop(dentry); 2465 d_drop(dentry);
2466 dentry = d_add_unique(dentry, igrab(state->inode)); 2466 alias = d_exact_alias(dentry, state->inode);
2467 if (dentry == NULL) { 2467 if (!alias)
2468 dentry = opendata->dentry; 2468 alias = d_splice_alias(igrab(state->inode), dentry);
2469 } else { 2469 /* d_splice_alias() can't fail here - it's a non-directory */
2470 if (alias) {
2470 dput(ctx->dentry); 2471 dput(ctx->dentry);
2471 ctx->dentry = dentry; 2472 ctx->dentry = dentry = alias;
2472 } 2473 }
2473 nfs_set_verifier(dentry, 2474 nfs_set_verifier(dentry,
2474 nfs_save_change_attribute(d_inode(opendata->dir))); 2475 nfs_save_change_attribute(d_inode(opendata->dir)));
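The _nfs4_open_and_get_state() hunk above swaps d_add_unique() for a two-step attach: reuse an exact existing alias for the inode if there is one, otherwise splice a fresh one in (which cannot fail for a non-directory). A stub model of that preference order; both helpers here return canned results rather than touching a real dcache:

#include <stddef.h>
#include <stdio.h>

struct dentry { const char *name; };

static struct dentry *exact_alias(struct dentry *d, int have_alias)
{
	static struct dentry reused = { "reused-alias" };
	(void)d;
	return have_alias ? &reused : NULL;
}

static struct dentry *splice_alias(struct dentry *d)
{
	static struct dentry spliced = { "spliced-alias" };
	(void)d;
	return &spliced;	/* cannot fail for a non-directory */
}

static struct dentry *attach(struct dentry *d, int have_alias)
{
	struct dentry *alias = exact_alias(d, have_alias);
	if (!alias)
		alias = splice_alias(d);
	return alias;
}

int main(void)
{
	struct dentry d = { "dentry" };
	printf("%s / %s\n", attach(&d, 1)->name, attach(&d, 0)->name);
	return 0;
}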
@@ -6782,13 +6783,26 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
6782 return false; 6783 return false;
6783} 6784}
6784 6785
6786static void
6787nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
6788{
6789}
6790
6791static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
6792 .rpc_call_done = &nfs4_bind_one_conn_to_session_done,
6793};
6794
6785/* 6795/*
6786 * nfs4_proc_bind_conn_to_session() 6796 * nfs4_proc_bind_one_conn_to_session()
6787 * 6797 *
6788 * The 4.1 client currently uses the same TCP connection for the 6798 * The 4.1 client currently uses the same TCP connection for the
6789 * fore and backchannel. 6799 * fore and backchannel.
6790 */ 6800 */
6791int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) 6801static
6802int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
6803 struct rpc_xprt *xprt,
6804 struct nfs_client *clp,
6805 struct rpc_cred *cred)
6792{ 6806{
6793 int status; 6807 int status;
6794 struct nfs41_bind_conn_to_session_args args = { 6808 struct nfs41_bind_conn_to_session_args args = {
@@ -6803,6 +6817,14 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
6803 .rpc_resp = &res, 6817 .rpc_resp = &res,
6804 .rpc_cred = cred, 6818 .rpc_cred = cred,
6805 }; 6819 };
6820 struct rpc_task_setup task_setup_data = {
6821 .rpc_client = clnt,
6822 .rpc_xprt = xprt,
6823 .callback_ops = &nfs4_bind_one_conn_to_session_ops,
6824 .rpc_message = &msg,
6825 .flags = RPC_TASK_TIMEOUT,
6826 };
6827 struct rpc_task *task;
6806 6828
6807 dprintk("--> %s\n", __func__); 6829 dprintk("--> %s\n", __func__);
6808 6830
@@ -6810,7 +6832,16 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
6810 if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) 6832 if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
6811 args.dir = NFS4_CDFC4_FORE; 6833 args.dir = NFS4_CDFC4_FORE;
6812 6834
6813 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 6835 /* Do not set the backchannel flag unless this is clnt->cl_xprt */
6836 if (xprt != rcu_access_pointer(clnt->cl_xprt))
6837 args.dir = NFS4_CDFC4_FORE;
6838
6839 task = rpc_run_task(&task_setup_data);
6840 if (!IS_ERR(task)) {
6841 status = task->tk_status;
6842 rpc_put_task(task);
6843 } else
6844 status = PTR_ERR(task);
6814 trace_nfs4_bind_conn_to_session(clp, status); 6845 trace_nfs4_bind_conn_to_session(clp, status);
6815 if (status == 0) { 6846 if (status == 0) {
6816 if (memcmp(res.sessionid.data, 6847 if (memcmp(res.sessionid.data,
@@ -6837,6 +6868,31 @@ out:
6837 return status; 6868 return status;
6838} 6869}
6839 6870
6871struct rpc_bind_conn_calldata {
6872 struct nfs_client *clp;
6873 struct rpc_cred *cred;
6874};
6875
6876static int
6877nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
6878 struct rpc_xprt *xprt,
6879 void *calldata)
6880{
6881 struct rpc_bind_conn_calldata *p = calldata;
6882
6883 return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
6884}
6885
6886int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
6887{
6888 struct rpc_bind_conn_calldata data = {
6889 .clp = clp,
6890 .cred = cred,
6891 };
6892 return rpc_clnt_iterate_for_each_xprt(clp->cl_rpcclient,
6893 nfs4_proc_bind_conn_to_session_callback, &data);
6894}
6895
6840/* 6896/*
6841 * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map 6897 * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
6842 * and operations we'd like to see to enable certain features in the allow map 6898 * and operations we'd like to see to enable certain features in the allow map
@@ -7319,7 +7375,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
7319 args->bc_attrs.max_resp_sz = PAGE_SIZE; 7375 args->bc_attrs.max_resp_sz = PAGE_SIZE;
7320 args->bc_attrs.max_resp_sz_cached = 0; 7376 args->bc_attrs.max_resp_sz_cached = 0;
7321 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; 7377 args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
7322 args->bc_attrs.max_reqs = 1; 7378 args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
7323 7379
7324 dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " 7380 dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
7325 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", 7381 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e23366effcfb..332d06e64fa9 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -135,6 +135,43 @@ static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
135 return ERR_PTR(-ENOMEM); 135 return ERR_PTR(-ENOMEM);
136} 136}
137 137
138static void nfs4_lock_slot(struct nfs4_slot_table *tbl,
139 struct nfs4_slot *slot)
140{
141 u32 slotid = slot->slot_nr;
142
143 __set_bit(slotid, tbl->used_slots);
144 if (slotid > tbl->highest_used_slotid ||
145 tbl->highest_used_slotid == NFS4_NO_SLOT)
146 tbl->highest_used_slotid = slotid;
147 slot->generation = tbl->generation;
148}
149
150/*
151 * nfs4_try_to_lock_slot - Given a slot try to allocate it
152 *
153 * Note: must be called with the slot_tbl_lock held.
154 */
155bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
156{
157 if (nfs4_test_locked_slot(tbl, slot->slot_nr))
158 return false;
159 nfs4_lock_slot(tbl, slot);
160 return true;
161}
162
163/*
164 * nfs4_lookup_slot - Find a slot but don't allocate it
165 *
166 * Note: must be called with the slot_tbl_lock held.
167 */
168struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
169{
170 if (slotid <= tbl->max_slotid)
171 return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
172 return ERR_PTR(-E2BIG);
173}
174
138/* 175/*
139 * nfs4_alloc_slot - efficiently look for a free slot 176 * nfs4_alloc_slot - efficiently look for a free slot
140 * 177 *
@@ -153,18 +190,11 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
153 __func__, tbl->used_slots[0], tbl->highest_used_slotid, 190 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
154 tbl->max_slotid + 1); 191 tbl->max_slotid + 1);
155 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); 192 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
156 if (slotid > tbl->max_slotid) 193 if (slotid <= tbl->max_slotid) {
157 goto out; 194 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
158 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); 195 if (!IS_ERR(ret))
159 if (IS_ERR(ret)) 196 nfs4_lock_slot(tbl, ret);
160 goto out; 197 }
161 __set_bit(slotid, tbl->used_slots);
162 if (slotid > tbl->highest_used_slotid ||
163 tbl->highest_used_slotid == NFS4_NO_SLOT)
164 tbl->highest_used_slotid = slotid;
165 ret->generation = tbl->generation;
166
167out:
168 dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n", 198 dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
169 __func__, tbl->used_slots[0], tbl->highest_used_slotid, 199 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
170 !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT); 200 !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
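nfs4_try_to_lock_slot() above is a test-then-set under slot_tbl_lock: bail out if the slot's bit is already set, otherwise mark it used and raise the high-water mark. A single-threaded toy of just that bookkeeping, with a 64-bit word standing in for the used_slots bitmap (the real code relies on the spinlock for atomicity):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NO_SLOT 0xFFFFFFFFu

struct slot_table {
	uint64_t used;		/* one bit per slot, up to 64 here */
	uint32_t highest_used;
};

static bool try_to_lock_slot(struct slot_table *tbl, uint32_t slotid)
{
	if (tbl->used & (1ULL << slotid))
		return false;			/* already in use */
	tbl->used |= 1ULL << slotid;
	if (tbl->highest_used == NO_SLOT || slotid > tbl->highest_used)
		tbl->highest_used = slotid;
	return true;
}

int main(void)
{
	struct slot_table tbl = { 0, NO_SLOT };

	printf("%d %d highest=%u\n",
	       try_to_lock_slot(&tbl, 3),
	       try_to_lock_slot(&tbl, 3),	/* second take must fail */
	       tbl.highest_used);
	return 0;
}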
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index e3ea2c5324d6..5b51298d1d03 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -77,6 +77,8 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
77 unsigned int max_reqs, const char *queue); 77 unsigned int max_reqs, const char *queue);
78extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); 78extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
79extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); 79extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
80extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
81extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
80extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); 82extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
81extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); 83extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
82bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, 84bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
@@ -88,6 +90,12 @@ static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
88 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); 90 return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
89} 91}
90 92
93static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl,
94 u32 slotid)
95{
96 return !!test_bit(slotid, tbl->used_slots);
97}
98
91#if defined(CONFIG_NFS_V4_1) 99#if defined(CONFIG_NFS_V4_1)
92extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, 100extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
93 u32 target_highest_slotid); 101 u32 target_highest_slotid);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 81ac6480f9e7..4aaed890048f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -606,12 +606,22 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
606 dprintk("%s: DS %s: trying address %s\n", 606 dprintk("%s: DS %s: trying address %s\n",
607 __func__, ds->ds_remotestr, da->da_remotestr); 607 __func__, ds->ds_remotestr, da->da_remotestr);
608 608
609 clp = get_v3_ds_connect(mds_srv->nfs_client, 609 if (!IS_ERR(clp)) {
610 struct xprt_create xprt_args = {
611 .ident = XPRT_TRANSPORT_TCP,
612 .net = clp->cl_net,
613 .dstaddr = (struct sockaddr *)&da->da_addr,
614 .addrlen = da->da_addrlen,
615 .servername = clp->cl_hostname,
616 };
617 /* Add this address as an alias */
618 rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
619 rpc_clnt_test_and_add_xprt, NULL);
620 } else
621 clp = get_v3_ds_connect(mds_srv->nfs_client,
610 (struct sockaddr *)&da->da_addr, 622 (struct sockaddr *)&da->da_addr,
611 da->da_addrlen, IPPROTO_TCP, 623 da->da_addrlen, IPPROTO_TCP,
612 timeo, retrans, au_flavor); 624 timeo, retrans, au_flavor);
613 if (!IS_ERR(clp))
614 break;
615 } 625 }
616 626
617 if (IS_ERR(clp)) { 627 if (IS_ERR(clp)) {
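The rewritten DS-connect loop above treats the first successful connection as creating the client and every later address as an additional transport on that same client via rpc_clnt_add_xprt(). A toy model of the accumulation; the connect and add-transport stubs are placeholders, not the sunrpc API:

#include <stdio.h>

struct client { int nxprts; };

static struct client *connect_first(const char *addr, struct client *storage)
{
	printf("primary transport: %s\n", addr);
	storage->nxprts = 1;
	return storage;
}

static void add_alias_xprt(struct client *clp, const char *addr)
{
	printf("alias transport:   %s\n", addr);
	clp->nxprts++;
}

int main(void)
{
	const char *addrs[] = { "10.0.0.1", "10.0.0.2", "10.0.0.3" };
	struct client storage, *clp = NULL;

	for (int i = 0; i < 3; i++) {
		if (!clp)
			clp = connect_first(addrs[i], &storage);
		else
			add_alias_xprt(clp, addrs[i]);	/* rpc_clnt_add_xprt analogue */
	}
	printf("transports: %d\n", clp->nxprts);
	return 0;
}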
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index a0b77fc1bd39..c9f583d7bac8 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -84,12 +84,30 @@ config NFSD_V4
84 If unsure, say N. 84 If unsure, say N.
85 85
86config NFSD_PNFS 86config NFSD_PNFS
87 bool "NFSv4.1 server support for Parallel NFS (pNFS)" 87 bool
88 depends on NFSD_V4 88
89config NFSD_BLOCKLAYOUT
90 bool "NFSv4.1 server support for pNFS block layouts"
91 depends on NFSD_V4 && BLOCK
92 select NFSD_PNFS
93 help
94 This option enables support for the exporting pNFS block layouts
95 in the kernel's NFS server. The pNFS block layout enables NFS
96 clients to directly perform I/O to block devices accesible to both
97 the server and the clients. See RFC 5663 for more details.
98
99 If unsure, say N.
100
101config NFSD_SCSILAYOUT
102 bool "NFSv4.1 server support for pNFS SCSI layouts"
103 depends on NFSD_V4 && BLOCK
104 select NFSD_PNFS
89 help 105 help
90 This option enables support for the parallel NFS features of the 106 This option enables support for exporting pNFS SCSI layouts
91 minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS 107 in the kernel's NFS server. The pNFS SCSI layout enables NFS
92 server. 108 clients to directly perform I/O to SCSI devices accessible to both
109 the server and the clients. See draft-ietf-nfsv4-scsi-layout for
110 more details.
93 111
94 If unsure, say N. 112 If unsure, say N.
95 113
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e120c6..3ae5f3c77e28 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
19 nfs4acl.o nfs4callback.o nfs4recover.o 19 nfs4acl.o nfs4callback.o nfs4recover.o
20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o 20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
21nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
22nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d9421bd5e..e55b5242614d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,11 +1,14 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/exportfs.h> 4#include <linux/exportfs.h>
5#include <linux/genhd.h> 5#include <linux/genhd.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/pr.h>
7 8
8#include <linux/nfsd/debug.h> 9#include <linux/nfsd/debug.h>
10#include <scsi/scsi_proto.h>
11#include <scsi/scsi_common.h>
9 12
10#include "blocklayoutxdr.h" 13#include "blocklayoutxdr.h"
11#include "pnfs.h" 14#include "pnfs.h"
@@ -13,37 +16,6 @@
13#define NFSDDBG_FACILITY NFSDDBG_PNFS 16#define NFSDDBG_FACILITY NFSDDBG_PNFS
14 17
15 18
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32 19static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, 20nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args) 21 struct nfsd4_layoutget *args)
@@ -141,20 +113,13 @@ out_layoutunavailable:
141} 113}
142 114
143static __be32 115static __be32
144nfsd4_block_proc_layoutcommit(struct inode *inode, 116nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
145 struct nfsd4_layoutcommit *lcp) 117 struct iomap *iomaps, int nr_iomaps)
146{ 118{
147 loff_t new_size = lcp->lc_last_wr + 1; 119 loff_t new_size = lcp->lc_last_wr + 1;
148 struct iattr iattr = { .ia_valid = 0 }; 120 struct iattr iattr = { .ia_valid = 0 };
149 struct iomap *iomaps;
150 int nr_iomaps;
151 int error; 121 int error;
152 122
153 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
154 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
155 if (nr_iomaps < 0)
156 return nfserrno(nr_iomaps);
157
158 if (lcp->lc_mtime.tv_nsec == UTIME_NOW || 123 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
159 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) 124 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
160 lcp->lc_mtime = current_fs_time(inode->i_sb); 125 lcp->lc_mtime = current_fs_time(inode->i_sb);
@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
172 return nfserrno(error); 137 return nfserrno(error);
173} 138}
174 139
140#ifdef CONFIG_NFSD_BLOCKLAYOUT
141static int
142nfsd4_block_get_device_info_simple(struct super_block *sb,
143 struct nfsd4_getdeviceinfo *gdp)
144{
145 struct pnfs_block_deviceaddr *dev;
146 struct pnfs_block_volume *b;
147
148 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
149 sizeof(struct pnfs_block_volume), GFP_KERNEL);
150 if (!dev)
151 return -ENOMEM;
152 gdp->gd_device = dev;
153
154 dev->nr_volumes = 1;
155 b = &dev->volumes[0];
156
157 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
158 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
159 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
160 &b->simple.offset);
161}
162
163static __be32
164nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
165 struct nfs4_client *clp,
166 struct nfsd4_getdeviceinfo *gdp)
167{
168 if (sb->s_bdev != sb->s_bdev->bd_contains)
169 return nfserr_inval;
170 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
171}
172
173static __be32
174nfsd4_block_proc_layoutcommit(struct inode *inode,
175 struct nfsd4_layoutcommit *lcp)
176{
177 struct iomap *iomaps;
178 int nr_iomaps;
179
180 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
181 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
182 if (nr_iomaps < 0)
183 return nfserrno(nr_iomaps);
184
185 return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
186}
187
175const struct nfsd4_layout_ops bl_layout_ops = { 188const struct nfsd4_layout_ops bl_layout_ops = {
176 /* 189 /*
177 * Pretend that we send notification to the client. This is a blatant 190 * Pretend that we send notification to the client. This is a blatant
@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
190 .encode_layoutget = nfsd4_block_encode_layoutget, 203 .encode_layoutget = nfsd4_block_encode_layoutget,
191 .proc_layoutcommit = nfsd4_block_proc_layoutcommit, 204 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
192}; 205};
206#endif /* CONFIG_NFSD_BLOCKLAYOUT */
207
208#ifdef CONFIG_NFSD_SCSILAYOUT
209static int nfsd4_scsi_identify_device(struct block_device *bdev,
210 struct pnfs_block_volume *b)
211{
212 struct request_queue *q = bdev->bd_disk->queue;
213 struct request *rq;
214 size_t bufflen = 252, len, id_len;
215 u8 *buf, *d, type, assoc;
216 int error;
217
218 buf = kzalloc(bufflen, GFP_KERNEL);
219 if (!buf)
220 return -ENOMEM;
221
222 rq = blk_get_request(q, READ, GFP_KERNEL);
223 if (IS_ERR(rq)) {
224 error = -ENOMEM;
225 goto out_free_buf;
226 }
227 blk_rq_set_block_pc(rq);
228
229 error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
230 if (error)
231 goto out_put_request;
232
233 rq->cmd[0] = INQUIRY;
234 rq->cmd[1] = 1;
235 rq->cmd[2] = 0x83;
236 rq->cmd[3] = bufflen >> 8;
237 rq->cmd[4] = bufflen & 0xff;
238 rq->cmd_len = COMMAND_SIZE(INQUIRY);
239
240 error = blk_execute_rq(rq->q, NULL, rq, 1);
241 if (error) {
242 pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
243 rq->errors);
244 goto out_put_request;
245 }
246
247 len = (buf[2] << 8) + buf[3] + 4;
248 if (len > bufflen) {
249 pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
250 len);
251 goto out_put_request;
252 }
253
254 d = buf + 4;
255 for (d = buf + 4; d < buf + len; d += id_len + 4) {
256 id_len = d[3];
257 type = d[1] & 0xf;
258 assoc = (d[1] >> 4) & 0x3;
259
260 /*
261 * We only care about EUI-64 and NAA designator types
262 * with LU association.
263 */
264 if (assoc != 0x00)
265 continue;
266 if (type != 0x02 && type != 0x03)
267 continue;
268 if (id_len != 8 && id_len != 12 && id_len != 16)
269 continue;
270
271 b->scsi.code_set = PS_CODE_SET_BINARY;
272 b->scsi.designator_type = type == 0x02 ?
273 PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
274 b->scsi.designator_len = id_len;
275 memcpy(b->scsi.designator, d + 4, id_len);
276
277 /*
278 * If we found an 8 or 12 byte descriptor, continue on to
279 * see if a 16 byte one is available. If we find a
280 * 16 byte descriptor we're done.
281 */
282 if (id_len == 16)
283 break;
284 }
285
286out_put_request:
287 blk_put_request(rq);
288out_free_buf:
289 kfree(buf);
290 return error;
291}
292
293#define NFSD_MDS_PR_KEY 0x0100000000000000
294
295/*
296 * We use the client ID as a unique key for the reservations.
297 * This allows us to easily fence a client when recalls fail.
298 */
299static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
300{
301 return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
302}
303
304static int
305nfsd4_block_get_device_info_scsi(struct super_block *sb,
306 struct nfs4_client *clp,
307 struct nfsd4_getdeviceinfo *gdp)
308{
309 struct pnfs_block_deviceaddr *dev;
310 struct pnfs_block_volume *b;
311 const struct pr_ops *ops;
312 int error;
313
314 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
315 sizeof(struct pnfs_block_volume), GFP_KERNEL);
316 if (!dev)
317 return -ENOMEM;
318 gdp->gd_device = dev;
319
320 dev->nr_volumes = 1;
321 b = &dev->volumes[0];
322
323 b->type = PNFS_BLOCK_VOLUME_SCSI;
324 b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
325
326 error = nfsd4_scsi_identify_device(sb->s_bdev, b);
327 if (error)
328 return error;
329
330 ops = sb->s_bdev->bd_disk->fops->pr_ops;
331 if (!ops) {
332 pr_err("pNFS: device %s does not support PRs.\n",
333 sb->s_id);
334 return -EINVAL;
335 }
336
337 error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
338 if (error) {
339 pr_err("pNFS: failed to register key for device %s.\n",
340 sb->s_id);
341 return -EINVAL;
342 }
343
344 error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
345 PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
346 if (error) {
347 pr_err("pNFS: failed to reserve device %s.\n",
348 sb->s_id);
349 return -EINVAL;
350 }
351
352 return 0;
353}
354
355static __be32
356nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
357 struct nfs4_client *clp,
358 struct nfsd4_getdeviceinfo *gdp)
359{
360 if (sb->s_bdev != sb->s_bdev->bd_contains)
361 return nfserr_inval;
362 return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
363}
364static __be32
365nfsd4_scsi_proc_layoutcommit(struct inode *inode,
366 struct nfsd4_layoutcommit *lcp)
367{
368 struct iomap *iomaps;
369 int nr_iomaps;
370
371 nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
372 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
373 if (nr_iomaps < 0)
374 return nfserrno(nr_iomaps);
375
376 return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
377}
378
379static void
380nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
381{
382 struct nfs4_client *clp = ls->ls_stid.sc_client;
383 struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
384
385 bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
386 nfsd4_scsi_pr_key(clp), 0, true);
387}
388
389const struct nfsd4_layout_ops scsi_layout_ops = {
390 /*
391 * Pretend that we send notification to the client. This is a blatant
392 * lie to force recent Linux clients to cache our device IDs.
393 * We rarely change the device ID, so the harm of leaking deviceids
394 * for a while isn't too bad. Unfortunately RFC5661 is a complete mess
395 * in this regard, but I filed errata 4119 for this a while ago, and
396 * hopefully the Linux client will eventually start caching deviceids
397 * without this again.
398 */
399 .notify_types =
400 NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
401 .proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo,
402 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
403 .proc_layoutget = nfsd4_block_proc_layoutget,
404 .encode_layoutget = nfsd4_block_encode_layoutget,
405 .proc_layoutcommit = nfsd4_scsi_proc_layoutcommit,
406 .fence_client = nfsd4_scsi_fence_client,
407};
408#endif /* CONFIG_NFSD_SCSILAYOUT */
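
The descriptor walk in nfsd4_scsi_identify_device() above is easier to follow outside the request-queue plumbing. Below is a minimal, standalone C sketch of the same VPD page 0x83 parse, plus the PR-key packing from nfsd4_scsi_pr_key(). parse_vpd83(), pr_key() and the sample buffer are illustrative inventions for this sketch, not kernel interfaces.

/*
 * Hedged sketch: walk an SCSI INQUIRY VPD page 0x83 (Device
 * Identification) buffer the way nfsd4_scsi_identify_device() does,
 * keeping only LU-associated EUI-64/NAA designators and preferring a
 * 16-byte one. The sample buffer is made up, not from a real device.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct designator {
	uint8_t type;		/* 0x02 = EUI-64, 0x03 = NAA */
	uint8_t len;		/* 8, 12 or 16 */
	uint8_t id[16];
};

static int parse_vpd83(const uint8_t *buf, size_t bufflen, struct designator *out)
{
	/* page length field (bytes 2..3, big endian) plus the 4-byte header */
	size_t len = ((size_t)buf[2] << 8) + buf[3] + 4;
	const uint8_t *d;
	int found = 0;

	if (len > bufflen)
		return -1;		/* malformed response */

	for (d = buf + 4; d + 4 <= buf + len; d += d[3] + 4) {
		uint8_t id_len = d[3];
		uint8_t type = d[1] & 0xf;
		uint8_t assoc = (d[1] >> 4) & 0x3;

		if (assoc != 0x00)	/* want LU association only */
			continue;
		if (type != 0x02 && type != 0x03)
			continue;
		if (id_len != 8 && id_len != 12 && id_len != 16)
			continue;

		out->type = type;
		out->len = id_len;
		memcpy(out->id, d + 4, id_len);
		found = 1;
		if (id_len == 16)	/* a 16-byte descriptor wins outright */
			break;
	}
	return found ? 0 : -1;
}

/* PR key packing used by the patch: client boot time in the high word. */
static uint64_t pr_key(uint32_t cl_boot, uint32_t cl_id)
{
	return ((uint64_t)cl_boot << 32) | cl_id;
}

int main(void)
{
	/* one NAA (type 3), 8-byte, LU-associated descriptor */
	uint8_t page[] = { 0x00, 0x83, 0x00, 0x0c,
			   0x01, 0x03, 0x00, 0x08,
			   0x60, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd };
	struct designator id;

	if (parse_vpd83(page, sizeof(page), &id) == 0)
		printf("type %#x, %u bytes, pr_key 0x%llx\n",
		       (unsigned)id.type, (unsigned)id.len,
		       (unsigned long long)pr_key(1, 2));
	return 0;
}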
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc9bbc8..6c3b316f932e 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/sunrpc/svc.h> 4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h> 5#include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
53 p = xdr_encode_hyper(p, b->simple.offset); 53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); 54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break; 55 break;
56 case PNFS_BLOCK_VOLUME_SCSI:
57 len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
58 p = xdr_reserve_space(xdr, len);
59 if (!p)
60 return -ETOOSMALL;
61
62 *p++ = cpu_to_be32(b->type);
63 *p++ = cpu_to_be32(b->scsi.code_set);
64 *p++ = cpu_to_be32(b->scsi.designator_type);
65 p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
66 p = xdr_encode_hyper(p, b->scsi.pr_key);
67 break;
56 default: 68 default:
57 return -ENOTSUPP; 69 return -ENOTSUPP;
58 } 70 }
@@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size) 105 u32 block_size)
94{ 106{
95 struct iomap *iomaps; 107 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i; 108 u32 nr_iomaps, i;
97 109
98 if (len < sizeof(u32)) { 110 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len); 111 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL; 112 return -EINVAL;
101 } 113 }
114 len -= sizeof(u32);
115 if (len % PNFS_BLOCK_EXTENT_SIZE) {
116 dprintk("%s: extent array invalid: %u\n", __func__, len);
117 return -EINVAL;
118 }
102 119
103 nr_iomaps = be32_to_cpup(p++); 120 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; 121 if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n", 122 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected); 123 __func__, len, nr_iomaps);
108 return -EINVAL; 124 return -EINVAL;
109 } 125 }
110 126
@@ -155,3 +171,54 @@ fail:
155 kfree(iomaps); 171 kfree(iomaps);
156 return -EINVAL; 172 return -EINVAL;
157} 173}
174
175int
176nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
177 u32 block_size)
178{
179 struct iomap *iomaps;
180 u32 nr_iomaps, expected, i;
181
182 if (len < sizeof(u32)) {
183 dprintk("%s: extent array too small: %u\n", __func__, len);
184 return -EINVAL;
185 }
186
187 nr_iomaps = be32_to_cpup(p++);
188 expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
189 if (len != expected) {
190 dprintk("%s: extent array size mismatch: %u/%u\n",
191 __func__, len, expected);
192 return -EINVAL;
193 }
194
195 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
196 if (!iomaps) {
197 dprintk("%s: failed to allocate extent array\n", __func__);
198 return -ENOMEM;
199 }
200
201 for (i = 0; i < nr_iomaps; i++) {
202 u64 val;
203
204 p = xdr_decode_hyper(p, &val);
205 if (val & (block_size - 1)) {
206 dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
207 goto fail;
208 }
209 iomaps[i].offset = val;
210
211 p = xdr_decode_hyper(p, &val);
212 if (val & (block_size - 1)) {
213 dprintk("%s: unaligned length 0x%llx\n", __func__, val);
214 goto fail;
215 }
216 iomaps[i].length = val;
217 }
218
219 *iomapp = iomaps;
220 return nr_iomaps;
221fail:
222 kfree(iomaps);
223 return -EINVAL;
224}
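
The reworked check in nfsd4_block_decode_layoutupdate() above derives the expected extent count from the remaining buffer length rather than multiplying the on-wire count by the extent size, so a hostile count can no longer overflow the comparison. A minimal sketch of that validation order follows; the 44-byte extent size and all names are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

#define EXTENT_SIZE 44	/* illustrative stand-in for PNFS_BLOCK_EXTENT_SIZE */

static int validate_extent_array(uint32_t len, uint32_t nr_claimed)
{
	if (len < sizeof(uint32_t))
		return -1;			/* no room for the count itself */
	len -= sizeof(uint32_t);
	if (len % EXTENT_SIZE)
		return -1;			/* trailing garbage */
	if (nr_claimed != len / EXTENT_SIZE)
		return -1;			/* count disagrees with payload */
	return 0;
}

static int check_aligned(uint64_t val, uint32_t block_size)
{
	/* block_size is a power of two, so the low bits must be clear */
	return (val & (block_size - 1)) == 0;
}

int main(void)
{
	printf("%d %d\n", validate_extent_array(4 + 2 * EXTENT_SIZE, 2),
	       check_aligned(8192, 4096));
	return 0;
}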
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 6de925fe8499..397bc7563a49 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -15,6 +15,11 @@ struct pnfs_block_extent {
15 enum pnfs_block_extent_state es; 15 enum pnfs_block_extent_state es;
16}; 16};
17 17
18struct pnfs_block_range {
19 u64 foff;
20 u64 len;
21};
22
18/* 23/*
19 * Random upper cap for the uuid length to avoid unbounded allocation. 24 * Random upper cap for the uuid length to avoid unbounded allocation.
20 * Not actually limited by the protocol. 25 * Not actually limited by the protocol.
@@ -29,6 +34,13 @@ struct pnfs_block_volume {
29 u32 sig_len; 34 u32 sig_len;
30 u8 sig[PNFS_BLOCK_UUID_LEN]; 35 u8 sig[PNFS_BLOCK_UUID_LEN];
31 } simple; 36 } simple;
37 struct {
38 enum scsi_code_set code_set;
39 enum scsi_designator_type designator_type;
40 int designator_len;
41 u8 designator[256];
42 u64 pr_key;
43 } scsi;
32 }; 44 };
33}; 45};
34 46
@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
43 struct nfsd4_layoutget *lgp); 55 struct nfsd4_layoutget *lgp);
44int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, 56int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
45 u32 block_size); 57 u32 block_size);
58int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
59 u32 block_size);
46 60
47#endif /* _NFSD_BLOCKLAYOUTXDR_H */ 61#endif /* _NFSD_BLOCKLAYOUTXDR_H */
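
For reference, the reservation size used when encoding a PNFS_BLOCK_VOLUME_SCSI volume (4 + 4 + 4 + 4 + designator_len + 8 in the xdr hunk above) works without a padding term because XDR opaques are a length word plus data rounded up to 4 bytes, and the valid designator lengths (8, 12, 16) are already 4-byte multiples. A small sketch with invented helper names:

#include <stdio.h>

static unsigned xdr_opaque_size(unsigned data_len)
{
	return 4 + ((data_len + 3) & ~3u);	/* length word + padded payload */
}

static unsigned scsi_volume_size(unsigned designator_len)
{
	/* type + code_set + designator_type + opaque(designator) + pr_key */
	return 4 + 4 + 4 + xdr_opaque_size(designator_len) + 8;
}

int main(void)
{
	/* matches the hunk's 4+4+4+4+len+8 for len in {8, 12, 16} */
	printf("%u %u %u\n", scsi_volume_size(8), scsi_volume_size(12),
	       scsi_volume_size(16));
	return 0;
}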
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7b755b7f785c..51c3b06e8036 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
147{ 147{
148 __be32 nfserr; 148 __be32 nfserr;
149 u32 max_blocksize = svc_max_payload(rqstp); 149 u32 max_blocksize = svc_max_payload(rqstp);
150 unsigned long cnt = min(argp->count, max_blocksize);
150 151
151 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", 152 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
152 SVCFH_fmt(&argp->fh), 153 SVCFH_fmt(&argp->fh),
@@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
157 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 158 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
158 * + 1 (xdr opaque byte count) = 26 159 * + 1 (xdr opaque byte count) = 26
159 */ 160 */
160 resp->count = min(argp->count, max_blocksize); 161 resp->count = cnt;
161 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 162 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
162 163
163 fh_copy(&resp->fh, &argp->fh); 164 fh_copy(&resp->fh, &argp->fh);
@@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
167 &resp->count); 168 &resp->count);
168 if (nfserr == 0) { 169 if (nfserr == 0) {
169 struct inode *inode = d_inode(resp->fh.fh_dentry); 170 struct inode *inode = d_inode(resp->fh.fh_dentry);
170 171 resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
171 resp->eof = (argp->offset + resp->count) >= inode->i_size; 172 inode->i_size);
172 } 173 }
173 174
174 RETURN_STATUS(nfserr); 175 RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ce2d010d3b17..825c7bc8d789 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014 Christoph Hellwig.
3 */ 3 */
4#include <linux/blkdev.h>
4#include <linux/kmod.h> 5#include <linux/kmod.h>
5#include <linux/file.h> 6#include <linux/file.h>
6#include <linux/jhash.h> 7#include <linux/jhash.h>
@@ -26,7 +27,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops; 27static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27 28
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { 29const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
30#ifdef CONFIG_NFSD_BLOCKLAYOUT
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, 31 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
32#endif
33#ifdef CONFIG_NFSD_SCSILAYOUT
34 [LAYOUT_SCSI] = &scsi_layout_ops,
35#endif
30}; 36};
31 37
32/* pNFS device ID to export fsid mapping */ 38/* pNFS device ID to export fsid mapping */
@@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
121 if (!(exp->ex_flags & NFSEXP_PNFS)) 127 if (!(exp->ex_flags & NFSEXP_PNFS))
122 return; 128 return;
123 129
130 /*
131 * Check if the file system supports exporting a block-like layout.
132 * If the block device supports reservations, prefer the SCSI layout,
133 * otherwise advertise the block layout.
134 */
135#ifdef CONFIG_NFSD_BLOCKLAYOUT
124 if (sb->s_export_op->get_uuid && 136 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks && 137 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks) 138 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; 139 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
140#endif
141#ifdef CONFIG_NFSD_SCSILAYOUT
142 /* overwrite block layout selection if needed */
143 if (sb->s_export_op->map_blocks &&
144 sb->s_export_op->commit_blocks &&
145 sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
146 exp->ex_layout_type = LAYOUT_SCSI;
147#endif
128} 148}
129 149
130static void 150static void
@@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
590 610
591 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); 611 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
592 612
593 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
594
595 printk(KERN_WARNING 613 printk(KERN_WARNING
596 "nfsd: client %s failed to respond to layout recall. " 614 "nfsd: client %s failed to respond to layout recall. "
597 " Fencing..\n", addr_str); 615 " Fencing..\n", addr_str);
@@ -626,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
626 container_of(cb, struct nfs4_layout_stateid, ls_recall); 644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
627 struct nfsd_net *nn; 645 struct nfsd_net *nn;
628 ktime_t now, cutoff; 646 ktime_t now, cutoff;
647 const struct nfsd4_layout_ops *ops;
629 LIST_HEAD(reaplist); 648 LIST_HEAD(reaplist);
630 649
631 650
@@ -661,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
661 /* 680 /*
662 * Unknown error or non-responding client, we'll need to fence. 681 * Unknown error or non-responding client, we'll need to fence.
663 */ 682 */
664 nfsd4_cb_layout_fail(ls); 683 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
684
685 ops = nfsd4_layout_ops[ls->ls_layout_type];
686 if (ops->fence_client)
687 ops->fence_client(ls);
688 else
689 nfsd4_cb_layout_fail(ls);
665 return -1; 690 return -1;
666 } 691 }
667} 692}
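
The SCSI layout is only advertised above when the export's block device has pr_ops, because fencing relies on SCSI persistent reservations. The same primitives are reachable from userspace through the block-layer PR ioctls, which can be handy for checking a device before exporting it. A hedged sketch follows; the device path is a placeholder, so run it only against a scratch device you own.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/pr.h>

int main(void)
{
	/* mirror pr_register(bdev, 0, key, true): new key, ignore existing */
	struct pr_registration reg = { .old_key = 0,
				       .new_key = 0x0100000000000000ULL,
				       .flags = PR_FL_IGNORE_KEY };
	struct pr_reservation rsv = { .key = 0x0100000000000000ULL,
				      .type = PR_EXCLUSIVE_ACCESS_REG_ONLY };
	int fd = open("/dev/sdX", O_RDWR);	/* placeholder device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, IOC_PR_REGISTER, &reg))	/* like ops->pr_register() */
		perror("IOC_PR_REGISTER");
	else if (ioctl(fd, IOC_PR_RESERVE, &rsv))	/* like ops->pr_reserve() */
		perror("IOC_PR_RESERVE");
	close(fd);
	return 0;
}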
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4cba7865f496..de1ff1d98bb1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -864,12 +864,10 @@ static __be32
864nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 864nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
865 struct nfsd4_secinfo *secinfo) 865 struct nfsd4_secinfo *secinfo)
866{ 866{
867 struct svc_fh resfh;
868 struct svc_export *exp; 867 struct svc_export *exp;
869 struct dentry *dentry; 868 struct dentry *dentry;
870 __be32 err; 869 __be32 err;
871 870
872 fh_init(&resfh, NFS4_FHSIZE);
873 err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); 871 err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
874 if (err) 872 if (err)
875 return err; 873 return err;
@@ -878,6 +876,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
878 &exp, &dentry); 876 &exp, &dentry);
879 if (err) 877 if (err)
880 return err; 878 return err;
879 fh_unlock(&cstate->current_fh);
881 if (d_really_is_negative(dentry)) { 880 if (d_really_is_negative(dentry)) {
882 exp_put(exp); 881 exp_put(exp);
883 err = nfserr_noent; 882 err = nfserr_noent;
@@ -1269,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1269 goto out; 1268 goto out;
1270 1269
1271 nfserr = nfs_ok; 1270 nfserr = nfs_ok;
1272 if (gdp->gd_maxcount != 0) 1271 if (gdp->gd_maxcount != 0) {
1273 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); 1272 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
1273 cstate->session->se_client, gdp);
1274 }
1274 1275
1275 gdp->gd_notify_types &= ops->notify_types; 1276 gdp->gd_notify_types &= ops->notify_types;
1276out: 1277out:
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index dc8ebecf5618..66eaeb1e8c2c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,10 +32,10 @@
32* 32*
33*/ 33*/
34 34
35#include <crypto/hash.h>
35#include <linux/file.h> 36#include <linux/file.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
37#include <linux/namei.h> 38#include <linux/namei.h>
38#include <linux/crypto.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/fs.h> 40#include <linux/fs.h>
41#include <linux/module.h> 41#include <linux/module.h>
@@ -104,29 +104,35 @@ static int
104nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) 104nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
105{ 105{
106 struct xdr_netobj cksum; 106 struct xdr_netobj cksum;
107 struct hash_desc desc; 107 struct crypto_shash *tfm;
108 struct scatterlist sg;
109 int status; 108 int status;
110 109
111 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 110 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
112 clname->len, clname->data); 111 clname->len, clname->data);
113 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 112 tfm = crypto_alloc_shash("md5", 0, 0);
114 desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 113 if (IS_ERR(tfm)) {
115 if (IS_ERR(desc.tfm)) { 114 status = PTR_ERR(tfm);
116 status = PTR_ERR(desc.tfm);
117 goto out_no_tfm; 115 goto out_no_tfm;
118 } 116 }
119 117
120 cksum.len = crypto_hash_digestsize(desc.tfm); 118 cksum.len = crypto_shash_digestsize(tfm);
121 cksum.data = kmalloc(cksum.len, GFP_KERNEL); 119 cksum.data = kmalloc(cksum.len, GFP_KERNEL);
122 if (cksum.data == NULL) { 120 if (cksum.data == NULL) {
123 status = -ENOMEM; 121 status = -ENOMEM;
124 goto out; 122 goto out;
125 } 123 }
126 124
127 sg_init_one(&sg, clname->data, clname->len); 125 {
126 SHASH_DESC_ON_STACK(desc, tfm);
127
128 desc->tfm = tfm;
129 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
130
131 status = crypto_shash_digest(desc, clname->data, clname->len,
132 cksum.data);
133 shash_desc_zero(desc);
134 }
128 135
129 status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
130 if (status) 136 if (status)
131 goto out; 137 goto out;
132 138
@@ -135,7 +141,7 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
135 status = 0; 141 status = 0;
136out: 142out:
137 kfree(cksum.data); 143 kfree(cksum.data);
138 crypto_free_hash(desc.tfm); 144 crypto_free_shash(tfm);
139out_no_tfm: 145out_no_tfm:
140 return status; 146 return status;
141} 147}
@@ -1260,6 +1266,7 @@ nfsd4_umh_cltrack_init(struct net *net)
1260 /* XXX: The usermode helper is not working in container yet. */ 1266 /* XXX: The usermode helper is not working in container yet. */
1261 if (net != &init_net) { 1267 if (net != &init_net) {
1262 pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); 1268 pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
1269 kfree(grace_start);
1263 return -EINVAL; 1270 return -EINVAL;
1264 } 1271 }
1265 1272
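
The nfs4recover.c hunk above ports the MD5 checksum of the client name from the legacy hash_desc API to crypto_shash with an on-stack descriptor. A minimal kernel-module sketch of that one-shot digest pattern, written for a tree of this vintage (desc->flags still exists here); it is kernel-side code, not standalone userspace.

#include <crypto/hash.h>
#include <linux/module.h>

static int __init md5_demo_init(void)
{
	struct crypto_shash *tfm;
	u8 out[16];		/* MD5 digest is 16 bytes */
	int err;

	tfm = crypto_alloc_shash("md5", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
		err = crypto_shash_digest(desc, "client-1", 8, out);
		shash_desc_zero(desc);	/* scrub the stack descriptor */
	}

	crypto_free_shash(tfm);
	pr_info("md5 demo: %d %*phN\n", err, 16, out);
	return err;
}

static void __exit md5_demo_exit(void) { }

module_init(md5_demo_init);
module_exit(md5_demo_exit);
MODULE_LICENSE("GPL");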
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c484a2b6cd10..0462eeddfff9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2408,7 +2408,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
2408 default: /* checked by xdr code */ 2408 default: /* checked by xdr code */
2409 WARN_ON_ONCE(1); 2409 WARN_ON_ONCE(1);
2410 case SP4_SSV: 2410 case SP4_SSV:
2411 return nfserr_encr_alg_unsupp; 2411 status = nfserr_encr_alg_unsupp;
2412 goto out_nolock;
2412 } 2413 }
2413 2414
2414 /* Cases below refer to rfc 5661 section 18.35.4: */ 2415 /* Cases below refer to rfc 5661 section 18.35.4: */
@@ -2586,21 +2587,26 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
2586 return nfs_ok; 2587 return nfs_ok;
2587} 2588}
2588 2589
2590/*
2591 * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now.
2592 * These are based on similar macros in linux/sunrpc/msg_prot.h .
2593 */
2594#define RPC_MAX_HEADER_WITH_AUTH_SYS \
2595 (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK))
2596
2597#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \
2598 (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK))
2599
2589#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ 2600#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
2590 RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32)) 2601 RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32))
2591#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ 2602#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
2592 RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32)) 2603 RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \
2604 sizeof(__be32))
2593 2605
2594static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) 2606static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
2595{ 2607{
2596 ca->headerpadsz = 0; 2608 ca->headerpadsz = 0;
2597 2609
2598 /*
2599 * These RPC_MAX_HEADER macros are overkill, especially since we
2600 * don't even do gss on the backchannel yet. But this is still
2601 * less than 1k. Tighten up this estimate in the unlikely event
2602 * it turns out to be a problem for some client:
2603 */
2604 if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) 2610 if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
2605 return nfserr_toosmall; 2611 return nfserr_toosmall;
2606 if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) 2612 if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
@@ -2710,10 +2716,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2710 goto out_free_conn; 2716 goto out_free_conn;
2711 } 2717 }
2712 status = nfs_ok; 2718 status = nfs_ok;
2713 /* 2719 /* Persistent sessions are not supported */
2714 * We do not support RDMA or persistent sessions
2715 */
2716 cr_ses->flags &= ~SESSION4_PERSIST; 2720 cr_ses->flags &= ~SESSION4_PERSIST;
2721 /* Upshifting from TCP to RDMA is not supported */
2717 cr_ses->flags &= ~SESSION4_RDMA; 2722 cr_ses->flags &= ~SESSION4_RDMA;
2718 2723
2719 init_session(rqstp, new, conf, cr_ses); 2724 init_session(rqstp, new, conf, cr_ses);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d6ef0955a979..9df898ba648f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
1072 1072
1073 READ_BUF(4); 1073 READ_BUF(4);
1074 rename->rn_snamelen = be32_to_cpup(p++); 1074 rename->rn_snamelen = be32_to_cpup(p++);
1075 READ_BUF(rename->rn_snamelen + 4); 1075 READ_BUF(rename->rn_snamelen);
1076 SAVEMEM(rename->rn_sname, rename->rn_snamelen); 1076 SAVEMEM(rename->rn_sname, rename->rn_snamelen);
1077 READ_BUF(4);
1077 rename->rn_tnamelen = be32_to_cpup(p++); 1078 rename->rn_tnamelen = be32_to_cpup(p++);
1078 READ_BUF(rename->rn_tnamelen); 1079 READ_BUF(rename->rn_tnamelen);
1079 SAVEMEM(rename->rn_tname, rename->rn_tnamelen); 1080 SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
@@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
1155 READ_BUF(8); 1156 READ_BUF(8);
1156 setclientid->se_callback_prog = be32_to_cpup(p++); 1157 setclientid->se_callback_prog = be32_to_cpup(p++);
1157 setclientid->se_callback_netid_len = be32_to_cpup(p++); 1158 setclientid->se_callback_netid_len = be32_to_cpup(p++);
1158 1159 READ_BUF(setclientid->se_callback_netid_len);
1159 READ_BUF(setclientid->se_callback_netid_len + 4);
1160 SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); 1160 SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
1161 READ_BUF(4);
1161 setclientid->se_callback_addr_len = be32_to_cpup(p++); 1162 setclientid->se_callback_addr_len = be32_to_cpup(p++);
1162 1163
1163 READ_BUF(setclientid->se_callback_addr_len + 4); 1164 READ_BUF(setclientid->se_callback_addr_len);
1164 SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); 1165 SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
1166 READ_BUF(4);
1165 setclientid->se_callback_ident = be32_to_cpup(p++); 1167 setclientid->se_callback_ident = be32_to_cpup(p++);
1166 1168
1167 DECODE_TAIL; 1169 DECODE_TAIL;
@@ -1835,8 +1837,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1835 1837
1836 READ_BUF(4); 1838 READ_BUF(4);
1837 argp->taglen = be32_to_cpup(p++); 1839 argp->taglen = be32_to_cpup(p++);
1838 READ_BUF(argp->taglen + 8); 1840 READ_BUF(argp->taglen);
1839 SAVEMEM(argp->tag, argp->taglen); 1841 SAVEMEM(argp->tag, argp->taglen);
1842 READ_BUF(8);
1840 argp->minorversion = be32_to_cpup(p++); 1843 argp->minorversion = be32_to_cpup(p++);
1841 argp->opcnt = be32_to_cpup(p++); 1844 argp->opcnt = be32_to_cpup(p++);
1842 max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); 1845 max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
@@ -3060,7 +3063,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
3060 p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, 3063 p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
3061 NFS4_MAX_SESSIONID_LEN); 3064 NFS4_MAX_SESSIONID_LEN);
3062 *p++ = cpu_to_be32(bcts->dir); 3065 *p++ = cpu_to_be32(bcts->dir);
3063 /* Sorry, we do not yet support RDMA over 4.1: */ 3066 /* Upshifting from TCP to RDMA is not supported */
3064 *p++ = cpu_to_be32(0); 3067 *p++ = cpu_to_be32(0);
3065 } 3068 }
3066 return nfserr; 3069 return nfserr;
@@ -3362,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read(
3362 struct xdr_stream *xdr = &resp->xdr; 3365 struct xdr_stream *xdr = &resp->xdr;
3363 struct xdr_buf *buf = xdr->buf; 3366 struct xdr_buf *buf = xdr->buf;
3364 u32 eof; 3367 u32 eof;
3368 long len;
3365 int space_left; 3369 int space_left;
3366 __be32 nfserr; 3370 __be32 nfserr;
3367 __be32 *p = xdr->p - 2; 3371 __be32 *p = xdr->p - 2;
@@ -3370,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read(
3370 if (xdr->end - xdr->p < 1) 3374 if (xdr->end - xdr->p < 1)
3371 return nfserr_resource; 3375 return nfserr_resource;
3372 3376
3377 len = maxcount;
3373 nfserr = nfsd_splice_read(read->rd_rqstp, file, 3378 nfserr = nfsd_splice_read(read->rd_rqstp, file,
3374 read->rd_offset, &maxcount); 3379 read->rd_offset, &maxcount);
3375 if (nfserr) { 3380 if (nfserr) {
@@ -3382,8 +3387,8 @@ static __be32 nfsd4_encode_splice_read(
3382 return nfserr; 3387 return nfserr;
3383 } 3388 }
3384 3389
3385 eof = (read->rd_offset + maxcount >= 3390 eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
3386 d_inode(read->rd_fhp->fh_dentry)->i_size); 3391 d_inode(read->rd_fhp->fh_dentry)->i_size);
3387 3392
3388 *(p++) = htonl(eof); 3393 *(p++) = htonl(eof);
3389 *(p++) = htonl(maxcount); 3394 *(p++) = htonl(maxcount);
@@ -3453,14 +3458,15 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
3453 } 3458 }
3454 read->rd_vlen = v; 3459 read->rd_vlen = v;
3455 3460
3461 len = maxcount;
3456 nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, 3462 nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
3457 read->rd_vlen, &maxcount); 3463 read->rd_vlen, &maxcount);
3458 if (nfserr) 3464 if (nfserr)
3459 return nfserr; 3465 return nfserr;
3460 xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); 3466 xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
3461 3467
3462 eof = (read->rd_offset + maxcount >= 3468 eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
3463 d_inode(read->rd_fhp->fh_dentry)->i_size); 3469 d_inode(read->rd_fhp->fh_dentry)->i_size);
3464 3470
3465 tmp = htonl(eof); 3471 tmp = htonl(eof);
3466 write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); 3472 write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
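
The READ_BUF() changes above split "length + 4" reads into two separately bounded steps, so a huge client-supplied length can no longer wrap the addition before the bounds check. A standalone sketch of that decode discipline, with invented names:

#include <stdint.h>
#include <stdio.h>

struct xdr_cursor {
	const uint8_t *p, *end;
};

static int read_buf(struct xdr_cursor *c, const uint8_t **out, uint32_t n)
{
	if ((size_t)(c->end - c->p) < n)	/* no n+4 arithmetic anywhere */
		return -1;
	*out = c->p;
	c->p += n;
	return 0;
}

static int decode_opaque(struct xdr_cursor *c, const uint8_t **data, uint32_t *len)
{
	const uint8_t *q;

	if (read_buf(c, &q, 4))			/* step 1: just the length word */
		return -1;
	*len = ((uint32_t)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
	return read_buf(c, data, *len);		/* step 2: exactly the payload */
}

int main(void)
{
	uint8_t wire[] = { 0, 0, 0, 3, 'f', 'o', 'o' };
	struct xdr_cursor c = { wire, wire + sizeof(wire) };
	const uint8_t *data;
	uint32_t len;

	if (decode_opaque(&c, &data, &len) == 0)
		printf("%.*s\n", (int)len, data);
	return 0;
}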
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d4c4453674c6..7d073b9b1553 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
21 u32 notify_types; 21 u32 notify_types;
22 22
23 __be32 (*proc_getdeviceinfo)(struct super_block *sb, 23 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
24 struct nfs4_client *clp,
24 struct nfsd4_getdeviceinfo *gdevp); 25 struct nfsd4_getdeviceinfo *gdevp);
25 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, 26 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
26 struct nfsd4_getdeviceinfo *gdevp); 27 struct nfsd4_getdeviceinfo *gdevp);
@@ -32,10 +33,17 @@ struct nfsd4_layout_ops {
32 33
33 __be32 (*proc_layoutcommit)(struct inode *inode, 34 __be32 (*proc_layoutcommit)(struct inode *inode,
34 struct nfsd4_layoutcommit *lcp); 35 struct nfsd4_layoutcommit *lcp);
36
37 void (*fence_client)(struct nfs4_layout_stateid *ls);
35}; 38};
36 39
37extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; 40extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
41#ifdef CONFIG_NFSD_BLOCKLAYOUT
38extern const struct nfsd4_layout_ops bl_layout_ops; 42extern const struct nfsd4_layout_ops bl_layout_ops;
43#endif
44#ifdef CONFIG_NFSD_SCSILAYOUT
45extern const struct nfsd4_layout_ops scsi_layout_ops;
46#endif
39 47
40__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, 48__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
41 struct nfsd4_compound_state *cstate, stateid_t *stateid, 49 struct nfsd4_compound_state *cstate, stateid_t *stateid,
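
fence_client is deliberately optional in the ops table above; nfs4layouts.c probes it and falls back to the old recall-failure path when a layout type has no fencing method. A toy sketch of that optional-method dispatch (types and names are stand-ins):

#include <stdio.h>

struct layout_ops {
	void (*fence_client)(int client_id);	/* optional, may be NULL */
};

static void scsi_fence(int client_id)
{
	printf("preempting PR key of client %d\n", client_id);
}

static void generic_recall_fail(int client_id)
{
	printf("no fencing op, revoking state of client %d\n", client_id);
}

static void handle_recall_failure(const struct layout_ops *ops, int client_id)
{
	if (ops->fence_client)
		ops->fence_client(client_id);
	else
		generic_recall_fail(client_id);
}

int main(void)
{
	struct layout_ops scsi = { .fence_client = scsi_fence };
	struct layout_ops block = { 0 };

	handle_recall_failure(&scsi, 42);
	handle_recall_failure(&block, 42);
	return 0;
}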
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5d2a57e4c03a..d40010e4f1a9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -870,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
870 870
871 oldfs = get_fs(); 871 oldfs = get_fs();
872 set_fs(KERNEL_DS); 872 set_fs(KERNEL_DS);
873 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 873 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
874 set_fs(oldfs); 874 set_fs(oldfs);
875 return nfsd_finish_read(file, count, host_err); 875 return nfsd_finish_read(file, count, host_err);
876} 876}
@@ -957,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
957 957
958 /* Write the data. */ 958 /* Write the data. */
959 oldfs = get_fs(); set_fs(KERNEL_DS); 959 oldfs = get_fs(); set_fs(KERNEL_DS);
960 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); 960 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
961 set_fs(oldfs); 961 set_fs(oldfs);
962 if (host_err < 0) 962 if (host_err < 0)
963 goto out_nfserr; 963 goto out_nfserr;
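
vfs_readv() and vfs_writev() grew a per-call flags argument in this cycle; nfsd passes 0 to keep the old semantics. Userspace sees the same shape through preadv2()/pwritev2(). A small sketch (needs glibc 2.26 or later; the file path is just a placeholder):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char a[8], b[8];
	struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	int fd = open("/etc/hostname", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;
	n = preadv2(fd, vec, 2, 0, 0);	/* last argument is the flags word */
	if (n >= 0)
		printf("read %zd bytes\n", n);
	close(fd);
	return 0;
}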
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c11ba316f23f..2d573ec057f8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -139,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode)
139 || createmode == NFS4_CREATE_EXCLUSIVE4_1; 139 || createmode == NFS4_CREATE_EXCLUSIVE4_1;
140} 140}
141 141
142static inline bool nfsd_eof_on_read(long requested, long read,
143 loff_t offset, loff_t size)
144{
145 /* We assume a short read means eof: */
146 if (requested > read)
147 return true;
148 /*
149 * A non-short read might also reach end of file. The spec
150 * still requires us to set eof in that case.
151 *
152 * Further operations may have modified the file size since
153 * the read, so the following check is not atomic with the read.
154 * We've only seen that cause a problem for a client in the case
155 * where the read returned a count of 0 without setting eof.
156 * That case was fixed by the addition of the above check.
157 */
158 return (offset + read >= size);
159}
160
142#endif /* LINUX_NFSD_VFS_H */ 161#endif /* LINUX_NFSD_VFS_H */
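
The helper above encodes two rules: a short read implies EOF, and a full-length read that ends exactly at the file size must still report EOF (the spec requires it, per the comment). A standalone restatement with a few checks:

#include <stdbool.h>
#include <stdio.h>

static bool eof_on_read(long requested, long read, long long offset, long long size)
{
	if (requested > read)		/* short read: assume EOF */
		return true;
	return offset + read >= size;	/* full read landing on the file end */
}

int main(void)
{
	/* 100-byte file */
	printf("%d\n", eof_on_read(50, 30, 0, 100));	/* 1: short read */
	printf("%d\n", eof_on_read(50, 50, 50, 100));	/* 1: read hits EOF */
	printf("%d\n", eof_on_read(50, 50, 0, 100));	/* 0: more to read */
	return 0;
}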
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 45d650addd56..c20df77eff99 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page)
180 180
181 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " 181 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
182 "mapping=%p ino=%lu\n", 182 "mapping=%p ino=%lu\n",
183 page, atomic_read(&page->_count), 183 page, page_ref_count(page),
184 (unsigned long long)page->index, page->flags, m, ino); 184 (unsigned long long)page->index, page->flags, m, ino);
185 185
186 if (page_has_buffers(page)) { 186 if (page_has_buffers(page)) {
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
41 quota_local.o \ 41 quota_local.o \
42 quota_global.o \ 42 quota_global.o \
43 xattr.o \ 43 xattr.o \
44 acl.o 44 acl.o \
45 filecheck.o
45 46
46ocfs2_stackglue-objs := stackglue.o 47ocfs2_stackglue-objs := stackglue.o
47ocfs2_stack_o2cb-objs := stack_o2cb.o 48ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2516 struct ocfs2_extent_block *eb; 2516 struct ocfs2_extent_block *eb;
2517 u32 range; 2517 u32 range;
2518 2518
2519 /*
2520 * In normal tree rotation process, we will never touch the
2521 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2522 * doesn't reserve the credits for them either.
2523 *
2524 * But we do have a special case here which will update the rightmost
2525 * records for all the bh in the path.
2526 * So we have to allocate extra credits and access them.
2527 */
2528 ret = ocfs2_extend_trans(handle, subtree_index);
2529 if (ret) {
2530 mlog_errno(ret);
2531 goto out;
2532 }
2533
2534 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 2519 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2535 if (ret) { 2520 if (ret) {
2536 mlog_errno(ret); 2521 mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2956 right_path->p_node[subtree_root].bh->b_blocknr, 2941 right_path->p_node[subtree_root].bh->b_blocknr,
2957 right_path->p_tree_depth); 2942 right_path->p_tree_depth);
2958 2943
2959 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2944 ret = ocfs2_extend_rotate_transaction(handle, 0,
2960 orig_credits, left_path); 2945 orig_credits, left_path);
2961 if (ret) { 2946 if (ret) {
2962 mlog_errno(ret); 2947 mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
3029 struct ocfs2_extent_block *eb; 3014 struct ocfs2_extent_block *eb;
3030 struct ocfs2_extent_list *el; 3015 struct ocfs2_extent_list *el;
3031 3016
3032
3033 ret = ocfs2_et_sanity_check(et); 3017 ret = ocfs2_et_sanity_check(et);
3034 if (ret) 3018 if (ret)
3035 goto out; 3019 goto out;
3036 /*
3037 * There's two ways we handle this depending on
3038 * whether path is the only existing one.
3039 */
3040 ret = ocfs2_extend_rotate_transaction(handle, 0,
3041 handle->h_buffer_credits,
3042 path);
3043 if (ret) {
3044 mlog_errno(ret);
3045 goto out;
3046 }
3047 3020
3048 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 3021 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3049 if (ret) { 3022 if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3641 */ 3614 */
3642 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3615 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3643 le16_to_cpu(el->l_next_free_rec) == 1) { 3616 le16_to_cpu(el->l_next_free_rec) == 1) {
3617 /* extend credit for ocfs2_remove_rightmost_path */
3618 ret = ocfs2_extend_rotate_transaction(handle, 0,
3619 handle->h_buffer_credits,
3620 right_path);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3644 3625
3645 ret = ocfs2_remove_rightmost_path(handle, et, 3626 ret = ocfs2_remove_rightmost_path(handle, et,
3646 right_path, 3627 right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3679 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3660 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3680 3661
3681 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3662 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3663 /* extend credit for ocfs2_remove_rightmost_path */
3664 ret = ocfs2_extend_rotate_transaction(handle, 0,
3665 handle->h_buffer_credits,
3666 path);
3667 if (ret) {
3668 mlog_errno(ret);
3669 goto out;
3670 }
3682 /* 3671 /*
3683 * The merge code will need to create an empty 3672 * The merge code will need to create an empty
3684 * extent to take the place of the newly 3673 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3727 */ 3716 */
3728 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3717 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3729 3718
3719 /* extend credit for ocfs2_remove_rightmost_path */
3720 ret = ocfs2_extend_rotate_transaction(handle, 0,
3721 handle->h_buffer_credits,
3722 path);
3723 if (ret) {
3724 mlog_errno(ret);
3725 goto out;
3726 }
3727
3730 /* The merge left us with an empty extent, remove it. */ 3728 /* The merge left us with an empty extent, remove it. */
3731 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3729 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3732 if (ret) { 3730 if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3748 goto out; 3746 goto out;
3749 } 3747 }
3750 3748
3749 /* extend credit for ocfs2_remove_rightmost_path */
3750 ret = ocfs2_extend_rotate_transaction(handle, 0,
3751 handle->h_buffer_credits,
3752 path);
3753 if (ret) {
3754 mlog_errno(ret);
3755 goto out;
3756 }
3757
3751 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3758 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3752 /* 3759 /*
3753 * Error from this last rotate is not critical, so 3760 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
3783 } 3790 }
3784 3791
3785 if (ctxt->c_split_covers_rec) { 3792 if (ctxt->c_split_covers_rec) {
3793 /* extend credit for ocfs2_remove_rightmost_path */
3794 ret = ocfs2_extend_rotate_transaction(handle, 0,
3795 handle->h_buffer_credits,
3796 path);
3797 if (ret) {
3798 mlog_errno(ret);
3799 ret = 0;
3800 goto out;
3801 }
3802
3786 /* 3803 /*
3787 * The merge may have left an empty extent in 3804 * The merge may have left an empty extent in
3788 * our leaf. Try to rotate it away. 3805 * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
5342 struct ocfs2_extent_block *eb; 5359 struct ocfs2_extent_block *eb;
5343 5360
5344 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5361 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5362 /* extend credit for ocfs2_remove_rightmost_path */
5363 ret = ocfs2_extend_rotate_transaction(handle, 0,
5364 handle->h_buffer_credits,
5365 path);
5366 if (ret) {
5367 mlog_errno(ret);
5368 goto out;
5369 }
5370
5345 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5371 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5346 if (ret) { 5372 if (ret) {
5347 mlog_errno(ret); 5373 mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5928 5954
5929 ocfs2_journal_dirty(handle, tl_bh); 5955 ocfs2_journal_dirty(handle, tl_bh);
5930 5956
5931 /* TODO: Perhaps we can calculate the bulk of the
5932 * credits up front rather than extending like
5933 * this. */
5934 status = ocfs2_extend_trans(handle,
5935 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5936 if (status < 0) {
5937 mlog_errno(status);
5938 goto bail;
5939 }
5940
5941 rec = tl->tl_recs[i]; 5957 rec = tl->tl_recs[i];
5942 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5958 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5943 le32_to_cpu(rec.t_start)); 5959 le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5958 goto bail; 5974 goto bail;
5959 } 5975 }
5960 } 5976 }
5977
5978 status = ocfs2_extend_trans(handle,
5979 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5980 if (status < 0) {
5981 mlog_errno(status);
5982 goto bail;
5983 }
5961 i--; 5984 i--;
5962 } 5985 }
5963 5986
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6016 goto out_mutex; 6039 goto out_mutex;
6017 } 6040 }
6018 6041
6019 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6042 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
6020 if (IS_ERR(handle)) { 6043 if (IS_ERR(handle)) {
6021 status = PTR_ERR(handle); 6044 status = PTR_ERR(handle);
6022 mlog_errno(status); 6045 mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6079 if (cancel) 6102 if (cancel)
6080 cancel_delayed_work(&osb->osb_truncate_log_wq); 6103 cancel_delayed_work(&osb->osb_truncate_log_wq);
6081 6104
6082 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 6105 queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6083 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 6106 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6084 } 6107 }
6085} 6108}
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6253 6276
6254 if (tl_inode) { 6277 if (tl_inode) {
6255 cancel_delayed_work(&osb->osb_truncate_log_wq); 6278 cancel_delayed_work(&osb->osb_truncate_log_wq);
6256 flush_workqueue(ocfs2_wq); 6279 flush_workqueue(osb->ocfs2_wq);
6257 6280
6258 status = ocfs2_flush_truncate_log(osb); 6281 status = ocfs2_flush_truncate_log(osb);
6259 if (status < 0) 6282 if (status < 0)
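
A recurring shape in the alloc.c hunks above: instead of ocfs2_remove_rightmost_path() extending the journal transaction internally, each caller now tops up the handle's credits immediately before the call. A toy model of that caller-extends discipline; the handle and credit types are stand-ins for jbd2, not the real interfaces.

#include <stdio.h>

struct handle {
	int buffer_credits;	/* credits still available on this handle */
};

/* stand-in for ocfs2_extend_trans()/ocfs2_extend_rotate_transaction() */
static int extend_trans(struct handle *h, int extra)
{
	h->buffer_credits += extra;	/* the real code may restart the handle */
	return 0;
}

static int remove_rightmost_path(struct handle *h, int credits_needed)
{
	if (h->buffer_credits < credits_needed)
		return -1;		/* would overflow the transaction */
	h->buffer_credits -= credits_needed;
	return 0;
}

int main(void)
{
	struct handle h = { .buffer_credits = 4 };

	/* the caller extends before the call, as the hunks above now do */
	if (extend_trans(&h, 8) == 0 &&
	    remove_rightmost_path(&h, 10) == 0)
		printf("ok, %d credits left\n", h.buffer_credits);
	return 0;
}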
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index cda0361e95a4..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,153 +499,6 @@ bail:
499 return status; 499 return status;
500} 500}
501 501
502/*
503 * TODO: Make this into a generic get_blocks function.
504 *
505 * From do_direct_io in direct-io.c:
506 * "So what we do is to permit the ->get_blocks function to populate
507 * bh.b_size with the size of IO which is permitted at this offset and
508 * this i_blkbits."
509 *
510 * This function is called directly from get_more_blocks in direct-io.c.
511 *
512 * called like this: dio->get_blocks(dio->inode, fs_startblk,
513 * fs_count, map_bh, dio->rw == WRITE);
514 */
515static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
516 struct buffer_head *bh_result, int create)
517{
518 int ret;
519 u32 cpos = 0;
520 int alloc_locked = 0;
521 u64 p_blkno, inode_blocks, contig_blocks;
522 unsigned int ext_flags;
523 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
524 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
525 unsigned long len = bh_result->b_size;
526 unsigned int clusters_to_alloc = 0, contig_clusters = 0;
527
528 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
529
530 /* This function won't even be called if the request isn't all
531 * nicely aligned and of the right size, so there's no need
532 * for us to check any of that. */
533
534 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
535
536 down_read(&OCFS2_I(inode)->ip_alloc_sem);
537
538 /* This figures out the size of the next contiguous block, and
539 * our logical offset */
540 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
541 &contig_blocks, &ext_flags);
542 up_read(&OCFS2_I(inode)->ip_alloc_sem);
543
544 if (ret) {
545 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
546 (unsigned long long)iblock);
547 ret = -EIO;
548 goto bail;
549 }
550
551 /* We should already CoW the refcounted extent in case of create. */
552 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
553
554 /* allocate blocks if no p_blkno is found, and create == 1 */
555 if (!p_blkno && create) {
556 ret = ocfs2_inode_lock(inode, NULL, 1);
557 if (ret < 0) {
558 mlog_errno(ret);
559 goto bail;
560 }
561
562 alloc_locked = 1;
563
564 down_write(&OCFS2_I(inode)->ip_alloc_sem);
565
566 /* fill hole, allocate blocks can't be larger than the size
567 * of the hole */
568 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
569 contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
570 contig_blocks);
571 if (clusters_to_alloc > contig_clusters)
572 clusters_to_alloc = contig_clusters;
573
574 /* allocate extent and insert them into the extent tree */
575 ret = ocfs2_extend_allocation(inode, cpos,
576 clusters_to_alloc, 0);
577 if (ret < 0) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 mlog_errno(ret);
580 goto bail;
581 }
582
583 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
584 &contig_blocks, &ext_flags);
585 if (ret < 0) {
586 up_write(&OCFS2_I(inode)->ip_alloc_sem);
587 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
588 (unsigned long long)iblock);
589 ret = -EIO;
590 goto bail;
591 }
592 set_buffer_new(bh_result);
593 up_write(&OCFS2_I(inode)->ip_alloc_sem);
594 }
595
596 /*
597 * get_more_blocks() expects us to describe a hole by clearing
598 * the mapped bit on bh_result().
599 *
600 * Consider an unwritten extent as a hole.
601 */
602 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
603 map_bh(bh_result, inode->i_sb, p_blkno);
604 else
605 clear_buffer_mapped(bh_result);
606
607 /* make sure we don't map more than max_blocks blocks here as
608 that's all the kernel will handle at this point. */
609 if (max_blocks < contig_blocks)
610 contig_blocks = max_blocks;
611 bh_result->b_size = contig_blocks << blocksize_bits;
612bail:
613 if (alloc_locked)
614 ocfs2_inode_unlock(inode, 1);
615 return ret;
616}
617
618/*
619 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
620 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
621 * to protect io on one node from truncation on another.
622 */
623static void ocfs2_dio_end_io(struct kiocb *iocb,
624 loff_t offset,
625 ssize_t bytes,
626 void *private)
627{
628 struct inode *inode = file_inode(iocb->ki_filp);
629 int level;
630
631 /* this io's submitter should not have unlocked this before we could */
632 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
633
634 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
635 ocfs2_iocb_clear_unaligned_aio(iocb);
636
637 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
638 }
639
640 /* Let rw unlock to be done later to protect append direct io write */
641 if (offset + bytes <= i_size_read(inode)) {
642 ocfs2_iocb_clear_rw_locked(iocb);
643
644 level = ocfs2_iocb_rw_locked_level(iocb);
645 ocfs2_rw_unlock(inode, level);
646 }
647}
648
649static int ocfs2_releasepage(struct page *page, gfp_t wait) 502static int ocfs2_releasepage(struct page *page, gfp_t wait)
650{ 503{
651 if (!page_has_buffers(page)) 504 if (!page_has_buffers(page))
@@ -653,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
653 return try_to_free_buffers(page); 506 return try_to_free_buffers(page);
654} 507}
655 508
656static int ocfs2_is_overwrite(struct ocfs2_super *osb,
657 struct inode *inode, loff_t offset)
658{
659 int ret = 0;
660 u32 v_cpos = 0;
661 u32 p_cpos = 0;
662 unsigned int num_clusters = 0;
663 unsigned int ext_flags = 0;
664
665 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
666 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
667 &num_clusters, &ext_flags);
668 if (ret < 0) {
669 mlog_errno(ret);
670 return ret;
671 }
672
673 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
674 return 1;
675
676 return 0;
677}
678
679static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
680 struct inode *inode, loff_t offset,
681 u64 zero_len, int cluster_align)
682{
683 u32 p_cpos = 0;
684 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
685 unsigned int num_clusters = 0;
686 unsigned int ext_flags = 0;
687 int ret = 0;
688
689 if (offset <= i_size_read(inode) || cluster_align)
690 return 0;
691
692 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
693 &ext_flags);
694 if (ret < 0) {
695 mlog_errno(ret);
696 return ret;
697 }
698
699 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
700 u64 s = i_size_read(inode);
701 sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
702 (do_div(s, osb->s_clustersize) >> 9);
703
704 ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
705 zero_len >> 9, GFP_NOFS, false);
706 if (ret < 0)
707 mlog_errno(ret);
708 }
709
710 return ret;
711}
712
713static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
714 struct inode *inode, loff_t offset)
715{
716 u64 zero_start, zero_len, total_zero_len;
717 u32 p_cpos = 0, clusters_to_add;
718 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
719 unsigned int num_clusters = 0;
720 unsigned int ext_flags = 0;
721 u32 size_div, offset_div;
722 int ret = 0;
723
724 {
725 u64 o = offset;
726 u64 s = i_size_read(inode);
727
728 offset_div = do_div(o, osb->s_clustersize);
729 size_div = do_div(s, osb->s_clustersize);
730 }
731
732 if (offset <= i_size_read(inode))
733 return 0;
734
735 clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
736 ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
737 total_zero_len = offset - i_size_read(inode);
738 if (clusters_to_add)
739 total_zero_len -= offset_div;
740
741 /* Allocate clusters to fill out holes, and this is only needed
742 * when we add more than one clusters. Otherwise the cluster will
743 * be allocated during direct IO */
744 if (clusters_to_add > 1) {
745 ret = ocfs2_extend_allocation(inode,
746 OCFS2_I(inode)->ip_clusters,
747 clusters_to_add - 1, 0);
748 if (ret) {
749 mlog_errno(ret);
750 goto out;
751 }
752 }
753
754 while (total_zero_len) {
755 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
756 &ext_flags);
757 if (ret < 0) {
758 mlog_errno(ret);
759 goto out;
760 }
761
762 zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
763 size_div;
764 zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
765 size_div;
766 zero_len = min(total_zero_len, zero_len);
767
768 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
769 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
770 zero_start >> 9, zero_len >> 9,
771 GFP_NOFS, false);
772 if (ret < 0) {
773 mlog_errno(ret);
774 goto out;
775 }
776 }
777
778 total_zero_len -= zero_len;
779 v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
780
781 /* Only at first iteration can be cluster not aligned.
782 * So set size_div to 0 for the rest */
783 size_div = 0;
784 }
785
786out:
787 return ret;
788}
789
790static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
791 struct iov_iter *iter,
792 loff_t offset)
793{
794 ssize_t ret = 0;
795 ssize_t written = 0;
796 bool orphaned = false;
797 int is_overwrite = 0;
798 struct file *file = iocb->ki_filp;
799 struct inode *inode = file_inode(file)->i_mapping->host;
800 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
801 struct buffer_head *di_bh = NULL;
802 size_t count = iter->count;
803 journal_t *journal = osb->journal->j_journal;
804 u64 zero_len_head, zero_len_tail;
805 int cluster_align_head, cluster_align_tail;
806 loff_t final_size = offset + count;
807 int append_write = offset >= i_size_read(inode) ? 1 : 0;
808 unsigned int num_clusters = 0;
809 unsigned int ext_flags = 0;
810
811 {
812 u64 o = offset;
813 u64 s = i_size_read(inode);
814
815 zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
816 cluster_align_head = !zero_len_head;
817
818 zero_len_tail = osb->s_clustersize -
819 do_div(s, osb->s_clustersize);
820 if ((offset - i_size_read(inode)) < zero_len_tail)
821 zero_len_tail = offset - i_size_read(inode);
822 cluster_align_tail = !zero_len_tail;
823 }
824
825 /*
826 * when final_size > inode->i_size, inode->i_size will be
827 * updated after direct write, so add the inode to orphan
828 * dir first.
829 */
830 if (final_size > i_size_read(inode)) {
831 ret = ocfs2_add_inode_to_orphan(osb, inode);
832 if (ret < 0) {
833 mlog_errno(ret);
834 goto out;
835 }
836 orphaned = true;
837 }
838
839 if (append_write) {
840 ret = ocfs2_inode_lock(inode, NULL, 1);
841 if (ret < 0) {
842 mlog_errno(ret);
843 goto clean_orphan;
844 }
845
846	/* zero out the tail of the previously allocated cluster
847	 * that has not yet been zeroed */
848 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
849 down_read(&OCFS2_I(inode)->ip_alloc_sem);
850 ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
851 zero_len_tail, cluster_align_tail);
852 up_read(&OCFS2_I(inode)->ip_alloc_sem);
853 } else {
854 down_write(&OCFS2_I(inode)->ip_alloc_sem);
855 ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
856 offset);
857 up_write(&OCFS2_I(inode)->ip_alloc_sem);
858 }
859 if (ret < 0) {
860 mlog_errno(ret);
861 ocfs2_inode_unlock(inode, 1);
862 goto clean_orphan;
863 }
864
865 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
866 if (is_overwrite < 0) {
867 mlog_errno(is_overwrite);
868 ret = is_overwrite;
869 ocfs2_inode_unlock(inode, 1);
870 goto clean_orphan;
871 }
872
873 ocfs2_inode_unlock(inode, 1);
874 }
875
876 written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
877 offset, ocfs2_direct_IO_get_blocks,
878 ocfs2_dio_end_io, NULL, 0);
879	/* an overwrite AIO may return -EIOCBQUEUED, which is not an error */
880 if ((written < 0) && (written != -EIOCBQUEUED)) {
881 loff_t i_size = i_size_read(inode);
882
883 if (offset + count > i_size) {
884 ret = ocfs2_inode_lock(inode, &di_bh, 1);
885 if (ret < 0) {
886 mlog_errno(ret);
887 goto clean_orphan;
888 }
889
890 if (i_size == i_size_read(inode)) {
891 ret = ocfs2_truncate_file(inode, di_bh,
892 i_size);
893 if (ret < 0) {
894 if (ret != -ENOSPC)
895 mlog_errno(ret);
896
897 ocfs2_inode_unlock(inode, 1);
898 brelse(di_bh);
899 di_bh = NULL;
900 goto clean_orphan;
901 }
902 }
903
904 ocfs2_inode_unlock(inode, 1);
905 brelse(di_bh);
906 di_bh = NULL;
907
908 ret = jbd2_journal_force_commit(journal);
909 if (ret < 0)
910 mlog_errno(ret);
911 }
912 } else if (written > 0 && append_write && !is_overwrite &&
913 !cluster_align_head) {
914 /* zeroing out the allocated cluster head */
915 u32 p_cpos = 0;
916 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
917
918 ret = ocfs2_inode_lock(inode, NULL, 0);
919 if (ret < 0) {
920 mlog_errno(ret);
921 goto clean_orphan;
922 }
923
924 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
925 &num_clusters, &ext_flags);
926 if (ret < 0) {
927 mlog_errno(ret);
928 ocfs2_inode_unlock(inode, 0);
929 goto clean_orphan;
930 }
931
932 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
933
934 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
935 (u64)p_cpos << (osb->s_clustersize_bits - 9),
936 zero_len_head >> 9, GFP_NOFS, false);
937 if (ret < 0)
938 mlog_errno(ret);
939
940 ocfs2_inode_unlock(inode, 0);
941 }
942
943clean_orphan:
944 if (orphaned) {
945 int tmp_ret;
946 int update_isize = written > 0 ? 1 : 0;
947 loff_t end = update_isize ? offset + written : 0;
948
949 tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
950 if (tmp_ret < 0) {
951 ret = tmp_ret;
952 mlog_errno(ret);
953 goto out;
954 }
955
956 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
957 update_isize, end);
958 if (tmp_ret < 0) {
959 ocfs2_inode_unlock(inode, 1);
960 ret = tmp_ret;
961 mlog_errno(ret);
962 brelse(di_bh);
963 goto out;
964 }
965
966 ocfs2_inode_unlock(inode, 1);
967 brelse(di_bh);
968
969 tmp_ret = jbd2_journal_force_commit(journal);
970 if (tmp_ret < 0) {
971 ret = tmp_ret;
972 mlog_errno(tmp_ret);
973 }
974 }
975
976out:
977 if (ret >= 0)
978 ret = written;
979 return ret;
980}
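
/* A toy model of the crash-safety protocol above (userspace stand-ins
 * only): before a size-extending direct write, the inode is put on the
 * orphan list; on success it is removed and i_size updated, so a crash
 * mid-write leaves an orphan that recovery can truncate back. */

#include <stdio.h>

struct toy_inode { long i_size; int orphaned; };

static long toy_dio_write(struct toy_inode *ino, long off, long len, int crash)
{
	if (off + len > ino->i_size)
		ino->orphaned = 1;	/* record the "may grow" intent */

	if (crash)
		return -1;		/* recovery will see orphaned == 1 */

	if (off + len > ino->i_size)
		ino->i_size = off + len;
	ino->orphaned = 0;		/* fully committed */
	return len;
}

static void toy_recover(struct toy_inode *ino)
{
	if (ino->orphaned) {		/* crashed mid-extend */
		printf("recovery: truncating back to %ld\n", ino->i_size);
		ino->orphaned = 0;
	}
}

int main(void)
{
	struct toy_inode ino = { .i_size = 4096 };

	toy_dio_write(&ino, 4096, 4096, 1);	/* simulate a crash */
	toy_recover(&ino);
	return 0;
}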
981
982static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
983 loff_t offset)
984{
985 struct file *file = iocb->ki_filp;
986 struct inode *inode = file_inode(file)->i_mapping->host;
987 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
988 int full_coherency = !(osb->s_mount_opt &
989 OCFS2_MOUNT_COHERENCY_BUFFERED);
990
991 /*
992	 * Fall back to buffered I/O if we see an inode without
993 * extents.
994 */
995 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
996 return 0;
997
998	/* Fall back to buffered I/O if we are appending and
999 * concurrent O_DIRECT writes are allowed.
1000 */
1001 if (i_size_read(inode) <= offset && !full_coherency)
1002 return 0;
1003
1004 if (iov_iter_rw(iter) == READ)
1005 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
1006 iter, offset,
1007 ocfs2_direct_IO_get_blocks,
1008 ocfs2_dio_end_io, NULL, 0);
1009 else
1010 return ocfs2_direct_IO_write(iocb, iter, offset);
1011}
1012
1013static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 509static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
1014 u32 cpos, 510 u32 cpos,
1015 unsigned int *start, 511 unsigned int *start,
@@ -1196,6 +692,13 @@ next_bh:
1196 692
1197#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 693#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
1198 694
695struct ocfs2_unwritten_extent {
696 struct list_head ue_node;
697 struct list_head ue_ip_node;
698 u32 ue_cpos;
699 u32 ue_phys;
700};
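
/* A userspace sketch of why the struct above carries two list_heads: a
 * single allocation is linked into the write context's list (ue_node)
 * and the inode's list (ue_ip_node) at once, so either side can find
 * and unlink it. A minimal intrusive list stands in for the kernel's
 * struct list_head here. */

#include <stdio.h>
#include <stddef.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *h) { h->prev = h->next = h; }
static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct unwritten_extent {
	struct node ue_node;	/* on the write context's list */
	struct node ue_ip_node;	/* on the inode's list */
	unsigned cpos;
};

int main(void)
{
	struct node wc_list, ip_list;
	struct unwritten_extent ue = { .cpos = 42 };

	list_init(&wc_list); list_init(&ip_list);
	list_add_tail(&ue.ue_node, &wc_list);
	list_add_tail(&ue.ue_ip_node, &ip_list);

	/* walk the inode-side list and recover the containing extent */
	struct unwritten_extent *found =
		container_of(ip_list.next, struct unwritten_extent, ue_ip_node);
	printf("found cpos %u via ue_ip_node\n", found->cpos);
	return 0;
}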
701
1199/* 702/*
1200 * Describe the state of a single cluster to be written to. 703 * Describe the state of a single cluster to be written to.
1201 */ 704 */
@@ -1207,7 +710,7 @@ struct ocfs2_write_cluster_desc {
1207 * filled. 710 * filled.
1208 */ 711 */
1209 unsigned c_new; 712 unsigned c_new;
1210 unsigned c_unwritten; 713 unsigned c_clear_unwritten;
1211 unsigned c_needs_zero; 714 unsigned c_needs_zero;
1212}; 715};
1213 716
@@ -1219,6 +722,9 @@ struct ocfs2_write_ctxt {
1219 /* First cluster allocated in a nonsparse extend */ 722 /* First cluster allocated in a nonsparse extend */
1220 u32 w_first_new_cpos; 723 u32 w_first_new_cpos;
1221 724
725 /* Type of caller. Must be one of buffer, mmap, direct. */
726 ocfs2_write_type_t w_type;
727
1222 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 728 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
1223 729
1224 /* 730 /*
@@ -1267,6 +773,8 @@ struct ocfs2_write_ctxt {
1267 struct buffer_head *w_di_bh; 773 struct buffer_head *w_di_bh;
1268 774
1269 struct ocfs2_cached_dealloc_ctxt w_dealloc; 775 struct ocfs2_cached_dealloc_ctxt w_dealloc;
776
777 struct list_head w_unwritten_list;
1270}; 778};
1271 779
1272void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) 780void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1305,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
1305 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 813 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
1306} 814}
1307 815
1308static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 816static void ocfs2_free_unwritten_list(struct inode *inode,
817 struct list_head *head)
1309{ 818{
819 struct ocfs2_inode_info *oi = OCFS2_I(inode);
820 struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
821
822 list_for_each_entry_safe(ue, tmp, head, ue_node) {
823 list_del(&ue->ue_node);
824 spin_lock(&oi->ip_lock);
825 list_del(&ue->ue_ip_node);
826 spin_unlock(&oi->ip_lock);
827 kfree(ue);
828 }
829}
830
831static void ocfs2_free_write_ctxt(struct inode *inode,
832 struct ocfs2_write_ctxt *wc)
833{
834 ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
1310 ocfs2_unlock_pages(wc); 835 ocfs2_unlock_pages(wc);
1311 brelse(wc->w_di_bh); 836 brelse(wc->w_di_bh);
1312 kfree(wc); 837 kfree(wc);
@@ -1314,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
1314 839
1315static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 840static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1316 struct ocfs2_super *osb, loff_t pos, 841 struct ocfs2_super *osb, loff_t pos,
1317 unsigned len, struct buffer_head *di_bh) 842 unsigned len, ocfs2_write_type_t type,
843 struct buffer_head *di_bh)
1318{ 844{
1319 u32 cend; 845 u32 cend;
1320 struct ocfs2_write_ctxt *wc; 846 struct ocfs2_write_ctxt *wc;
@@ -1329,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1329 wc->w_clen = cend - wc->w_cpos + 1; 855 wc->w_clen = cend - wc->w_cpos + 1;
1330 get_bh(di_bh); 856 get_bh(di_bh);
1331 wc->w_di_bh = di_bh; 857 wc->w_di_bh = di_bh;
858 wc->w_type = type;
1332 859
1333 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 860 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
1334 wc->w_large_pages = 1; 861 wc->w_large_pages = 1;
@@ -1336,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
1336 wc->w_large_pages = 0; 863 wc->w_large_pages = 0;
1337 864
1338 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 865 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
866 INIT_LIST_HEAD(&wc->w_unwritten_list);
1339 867
1340 *wcp = wc; 868 *wcp = wc;
1341 869
@@ -1396,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
1396 to = user_pos + user_len; 924 to = user_pos + user_len;
1397 struct page *tmppage; 925 struct page *tmppage;
1398 926
1399 ocfs2_zero_new_buffers(wc->w_target_page, from, to); 927 if (wc->w_target_page)
928 ocfs2_zero_new_buffers(wc->w_target_page, from, to);
1400 929
1401 for(i = 0; i < wc->w_num_pages; i++) { 930 for(i = 0; i < wc->w_num_pages; i++) {
1402 tmppage = wc->w_pages[i]; 931 tmppage = wc->w_pages[i];
1403 932
1404 if (page_has_buffers(tmppage)) { 933 if (tmppage && page_has_buffers(tmppage)) {
1405 if (ocfs2_should_order_data(inode)) 934 if (ocfs2_should_order_data(inode))
1406 ocfs2_jbd2_file_inode(wc->w_handle, inode); 935 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1407 936
@@ -1531,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1531 wc->w_num_pages = 1; 1060 wc->w_num_pages = 1;
1532 start = target_index; 1061 start = target_index;
1533 } 1062 }
1063 end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
1534 1064
1535 for(i = 0; i < wc->w_num_pages; i++) { 1065 for(i = 0; i < wc->w_num_pages; i++) {
1536 index = start + i; 1066 index = start + i;
1537 1067
1538 if (index == target_index && mmap_page) { 1068 if (index >= target_index && index <= end_index &&
1069 wc->w_type == OCFS2_WRITE_MMAP) {
1539 /* 1070 /*
1540 * ocfs2_pagemkwrite() is a little different 1071 * ocfs2_pagemkwrite() is a little different
1541 * and wants us to directly use the page 1072 * and wants us to directly use the page
@@ -1554,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1554 page_cache_get(mmap_page); 1085 page_cache_get(mmap_page);
1555 wc->w_pages[i] = mmap_page; 1086 wc->w_pages[i] = mmap_page;
1556 wc->w_target_locked = true; 1087 wc->w_target_locked = true;
1088 } else if (index >= target_index && index <= end_index &&
1089 wc->w_type == OCFS2_WRITE_DIRECT) {
1090 /* Direct write has no mapping page. */
1091 wc->w_pages[i] = NULL;
1092 continue;
1557 } else { 1093 } else {
1558 wc->w_pages[i] = find_or_create_page(mapping, index, 1094 wc->w_pages[i] = find_or_create_page(mapping, index,
1559 GFP_NOFS); 1095 GFP_NOFS);
@@ -1578,19 +1114,20 @@ out:
1578 * Prepare a single cluster for write one cluster into the file. 1114 * Prepare a single cluster for write one cluster into the file.
1579 */ 1115 */
1580static int ocfs2_write_cluster(struct address_space *mapping, 1116static int ocfs2_write_cluster(struct address_space *mapping,
1581 u32 phys, unsigned int unwritten, 1117 u32 *phys, unsigned int new,
1118 unsigned int clear_unwritten,
1582 unsigned int should_zero, 1119 unsigned int should_zero,
1583 struct ocfs2_alloc_context *data_ac, 1120 struct ocfs2_alloc_context *data_ac,
1584 struct ocfs2_alloc_context *meta_ac, 1121 struct ocfs2_alloc_context *meta_ac,
1585 struct ocfs2_write_ctxt *wc, u32 cpos, 1122 struct ocfs2_write_ctxt *wc, u32 cpos,
1586 loff_t user_pos, unsigned user_len) 1123 loff_t user_pos, unsigned user_len)
1587{ 1124{
1588 int ret, i, new; 1125 int ret, i;
1589 u64 v_blkno, p_blkno; 1126 u64 p_blkno;
1590 struct inode *inode = mapping->host; 1127 struct inode *inode = mapping->host;
1591 struct ocfs2_extent_tree et; 1128 struct ocfs2_extent_tree et;
1129 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
1592 1130
1593 new = phys == 0 ? 1 : 0;
1594 if (new) { 1131 if (new) {
1595 u32 tmp_pos; 1132 u32 tmp_pos;
1596 1133
@@ -1600,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1600 */ 1137 */
1601 tmp_pos = cpos; 1138 tmp_pos = cpos;
1602 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1139 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1603 &tmp_pos, 1, 0, wc->w_di_bh, 1140 &tmp_pos, 1, !clear_unwritten,
1604 wc->w_handle, data_ac, 1141 wc->w_di_bh, wc->w_handle,
1605 meta_ac, NULL); 1142 data_ac, meta_ac, NULL);
1606 /* 1143 /*
1607 * This shouldn't happen because we must have already 1144 * This shouldn't happen because we must have already
1608 * calculated the correct meta data allocation required. The 1145 * calculated the correct meta data allocation required. The
@@ -1619,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1619 mlog_errno(ret); 1156 mlog_errno(ret);
1620 goto out; 1157 goto out;
1621 } 1158 }
1622 } else if (unwritten) { 1159 } else if (clear_unwritten) {
1623 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1160 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1624 wc->w_di_bh); 1161 wc->w_di_bh);
1625 ret = ocfs2_mark_extent_written(inode, &et, 1162 ret = ocfs2_mark_extent_written(inode, &et,
1626 wc->w_handle, cpos, 1, phys, 1163 wc->w_handle, cpos, 1, *phys,
1627 meta_ac, &wc->w_dealloc); 1164 meta_ac, &wc->w_dealloc);
1628 if (ret < 0) { 1165 if (ret < 0) {
1629 mlog_errno(ret); 1166 mlog_errno(ret);
@@ -1631,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1631 } 1168 }
1632 } 1169 }
1633 1170
1634 if (should_zero)
1635 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1636 else
1637 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1638
1639 /* 1171 /*
1640 * The only reason this should fail is due to an inability to 1172 * The only reason this should fail is due to an inability to
1641 * find the extent added. 1173 * find the extent added.
1642 */ 1174 */
1643 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1175 ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
1644 NULL);
1645 if (ret < 0) { 1176 if (ret < 0) {
1646 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " 1177 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1647 "at logical block %llu", 1178 "at logical cluster %u",
1648 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1179 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
1649 (unsigned long long)v_blkno);
1650 goto out; 1180 goto out;
1651 } 1181 }
1652 1182
1653 BUG_ON(p_blkno == 0); 1183 BUG_ON(*phys == 0);
1184
1185 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
1186 if (!should_zero)
1187 p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
1654 1188
1655 for(i = 0; i < wc->w_num_pages; i++) { 1189 for(i = 0; i < wc->w_num_pages; i++) {
1656 int tmpret; 1190 int tmpret;
1657 1191
1192 /* This is the direct io target page. */
1193 if (wc->w_pages[i] == NULL) {
1194 p_blkno++;
1195 continue;
1196 }
1197
1658 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1198 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1659 wc->w_pages[i], cpos, 1199 wc->w_pages[i], cpos,
1660 user_pos, user_len, 1200 user_pos, user_len,
@@ -1701,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1701 if ((cluster_off + local_len) > osb->s_clustersize) 1241 if ((cluster_off + local_len) > osb->s_clustersize)
1702 local_len = osb->s_clustersize - cluster_off; 1242 local_len = osb->s_clustersize - cluster_off;
1703 1243
1704 ret = ocfs2_write_cluster(mapping, desc->c_phys, 1244 ret = ocfs2_write_cluster(mapping, &desc->c_phys,
1705 desc->c_unwritten, 1245 desc->c_new,
1246 desc->c_clear_unwritten,
1706 desc->c_needs_zero, 1247 desc->c_needs_zero,
1707 data_ac, meta_ac, 1248 data_ac, meta_ac,
1708 wc, desc->c_cpos, pos, local_len); 1249 wc, desc->c_cpos, pos, local_len);
@@ -1773,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1773} 1314}
1774 1315
1775/* 1316/*
1317 * Check if this extent is marked UNWRITTEN by direct io. If so, we need not
1318 * do the zero work, and should not clear UNWRITTEN since it will be cleared
1319 * by the direct io procedure.
1320 * If this is a new extent allocated by direct io, we should mark it in
1321 * the ip_unwritten_list.
1322 */
1323static int ocfs2_unwritten_check(struct inode *inode,
1324 struct ocfs2_write_ctxt *wc,
1325 struct ocfs2_write_cluster_desc *desc)
1326{
1327 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1328 struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
1329 int ret = 0;
1330
1331 if (!desc->c_needs_zero)
1332 return 0;
1333
1334retry:
1335 spin_lock(&oi->ip_lock);
1336	/* No need to zero, whether buffered or direct: whoever claimed
1337	 * the cluster is doing the zeroing, and will clear unwritten after
1338	 * all cluster io has finished. */
1339 list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
1340 if (desc->c_cpos == ue->ue_cpos) {
1341 BUG_ON(desc->c_new);
1342 desc->c_needs_zero = 0;
1343 desc->c_clear_unwritten = 0;
1344 goto unlock;
1345 }
1346 }
1347
1348 if (wc->w_type != OCFS2_WRITE_DIRECT)
1349 goto unlock;
1350
1351 if (new == NULL) {
1352 spin_unlock(&oi->ip_lock);
1353 new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
1354 GFP_NOFS);
1355 if (new == NULL) {
1356 ret = -ENOMEM;
1357 goto out;
1358 }
1359 goto retry;
1360 }
1361	/* This direct write will do the zeroing. */
1362 new->ue_cpos = desc->c_cpos;
1363 new->ue_phys = desc->c_phys;
1364 desc->c_clear_unwritten = 0;
1365 list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
1366 list_add_tail(&new->ue_node, &wc->w_unwritten_list);
1367 new = NULL;
1368unlock:
1369 spin_unlock(&oi->ip_lock);
1370out:
1371 if (new)
1372 kfree(new);
1373 return ret;
1374}
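
/* A userspace sketch of the lock/drop/allocate/retry pattern above,
 * with a pthread mutex standing in for the ip_lock spinlock: we must
 * not allocate while holding a spinlock, so on a miss we drop the
 * lock, allocate, and redo the lookup, since the list may have changed
 * while the lock was released. */

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *entry;			/* stands in for the list search */

static int insert_once(void)
{
	void *new = NULL;

retry:
	pthread_mutex_lock(&lock);
	if (entry) {			/* someone beat us to it */
		pthread_mutex_unlock(&lock);
		free(new);
		return 0;
	}
	if (!new) {
		pthread_mutex_unlock(&lock);
		new = malloc(64);	/* may sleep; must not hold the lock */
		if (!new)
			return -1;
		goto retry;		/* re-check under the lock */
	}
	entry = new;			/* publish while holding the lock */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	int ret = insert_once();

	printf("insert_once: %d, entry %s\n", ret, entry ? "set" : "unset");
	free(entry);
	return 0;
}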
1375
1376/*
1776 * Populate each single-cluster write descriptor in the write context 1377 * Populate each single-cluster write descriptor in the write context
1777 * with information about the i/o to be done. 1378 * with information about the i/o to be done.
1778 * 1379 *
@@ -1847,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1847 if (phys == 0) { 1448 if (phys == 0) {
1848 desc->c_new = 1; 1449 desc->c_new = 1;
1849 desc->c_needs_zero = 1; 1450 desc->c_needs_zero = 1;
1451 desc->c_clear_unwritten = 1;
1850 *clusters_to_alloc = *clusters_to_alloc + 1; 1452 *clusters_to_alloc = *clusters_to_alloc + 1;
1851 } 1453 }
1852 1454
1853 if (ext_flags & OCFS2_EXT_UNWRITTEN) { 1455 if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1854 desc->c_unwritten = 1; 1456 desc->c_clear_unwritten = 1;
1855 desc->c_needs_zero = 1; 1457 desc->c_needs_zero = 1;
1856 } 1458 }
1857 1459
1460 ret = ocfs2_unwritten_check(inode, wc, desc);
1461 if (ret) {
1462 mlog_errno(ret);
1463 goto out;
1464 }
1465
1858 num_clusters--; 1466 num_clusters--;
1859 } 1467 }
1860 1468
@@ -2017,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
2017 if (ret) 1625 if (ret)
2018 mlog_errno(ret); 1626 mlog_errno(ret);
2019 1627
2020 wc->w_first_new_cpos = 1628 /* There is no wc if this is call from direct. */
2021 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 1629 if (wc)
1630 wc->w_first_new_cpos =
1631 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
2022 1632
2023 return ret; 1633 return ret;
2024} 1634}
@@ -2072,9 +1682,8 @@ out:
2072 return ret; 1682 return ret;
2073} 1683}
2074 1684
2075int ocfs2_write_begin_nolock(struct file *filp, 1685int ocfs2_write_begin_nolock(struct address_space *mapping,
2076 struct address_space *mapping, 1686 loff_t pos, unsigned len, ocfs2_write_type_t type,
2077 loff_t pos, unsigned len, unsigned flags,
2078 struct page **pagep, void **fsdata, 1687 struct page **pagep, void **fsdata,
2079 struct buffer_head *di_bh, struct page *mmap_page) 1688 struct buffer_head *di_bh, struct page *mmap_page)
2080{ 1689{
@@ -2091,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
2091 int try_free = 1, ret1; 1700 int try_free = 1, ret1;
2092 1701
2093try_again: 1702try_again:
2094 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1703 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
2095 if (ret) { 1704 if (ret) {
2096 mlog_errno(ret); 1705 mlog_errno(ret);
2097 return ret; 1706 return ret;
@@ -2110,14 +1719,17 @@ try_again:
2110 } 1719 }
2111 } 1720 }
2112 1721
2113 if (ocfs2_sparse_alloc(osb)) 1722 /* Direct io change i_size late, should not zero tail here. */
2114 ret = ocfs2_zero_tail(inode, di_bh, pos); 1723 if (type != OCFS2_WRITE_DIRECT) {
2115 else 1724 if (ocfs2_sparse_alloc(osb))
2116 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, 1725 ret = ocfs2_zero_tail(inode, di_bh, pos);
2117 wc); 1726 else
2118 if (ret) { 1727 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2119 mlog_errno(ret); 1728 len, wc);
2120 goto out; 1729 if (ret) {
1730 mlog_errno(ret);
1731 goto out;
1732 }
2121 } 1733 }
2122 1734
2123 ret = ocfs2_check_range_for_refcount(inode, pos, len); 1735 ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2148,7 +1760,7 @@ try_again:
2148 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1760 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2149 (long long)i_size_read(inode), 1761 (long long)i_size_read(inode),
2150 le32_to_cpu(di->i_clusters), 1762 le32_to_cpu(di->i_clusters),
2151 pos, len, flags, mmap_page, 1763 pos, len, type, mmap_page,
2152 clusters_to_alloc, extents_to_split); 1764 clusters_to_alloc, extents_to_split);
2153 1765
2154 /* 1766 /*
@@ -2178,17 +1790,17 @@ try_again:
2178 1790
2179 credits = ocfs2_calc_extend_credits(inode->i_sb, 1791 credits = ocfs2_calc_extend_credits(inode->i_sb,
2180 &di->id2.i_list); 1792 &di->id2.i_list);
2181 1793 } else if (type == OCFS2_WRITE_DIRECT)
2182 } 1794 /* direct write needs not to start trans if no extents alloc. */
1795 goto success;
2183 1796
2184 /* 1797 /*
2185 * We have to zero sparse allocated clusters, unwritten extent clusters, 1798 * We have to zero sparse allocated clusters, unwritten extent clusters,
2186 * and non-sparse clusters we just extended. For non-sparse writes, 1799 * and non-sparse clusters we just extended. For non-sparse writes,
2187 * we know zeros will only be needed in the first and/or last cluster. 1800 * we know zeros will only be needed in the first and/or last cluster.
2188 */ 1801 */
2189 if (clusters_to_alloc || extents_to_split || 1802 if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
2190 (wc->w_clen && (wc->w_desc[0].c_needs_zero || 1803 wc->w_desc[wc->w_clen - 1].c_needs_zero))
2191 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
2192 cluster_of_pages = 1; 1804 cluster_of_pages = 1;
2193 else 1805 else
2194 cluster_of_pages = 0; 1806 cluster_of_pages = 0;
@@ -2255,7 +1867,8 @@ try_again:
2255 ocfs2_free_alloc_context(meta_ac); 1867 ocfs2_free_alloc_context(meta_ac);
2256 1868
2257success: 1869success:
2258 *pagep = wc->w_target_page; 1870 if (pagep)
1871 *pagep = wc->w_target_page;
2259 *fsdata = wc; 1872 *fsdata = wc;
2260 return 0; 1873 return 0;
2261out_quota: 1874out_quota:
@@ -2266,7 +1879,7 @@ out_commit:
2266 ocfs2_commit_trans(osb, handle); 1879 ocfs2_commit_trans(osb, handle);
2267 1880
2268out: 1881out:
2269 ocfs2_free_write_ctxt(wc); 1882 ocfs2_free_write_ctxt(inode, wc);
2270 1883
2271 if (data_ac) { 1884 if (data_ac) {
2272 ocfs2_free_alloc_context(data_ac); 1885 ocfs2_free_alloc_context(data_ac);
@@ -2318,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
2318 */ 1931 */
2319 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1932 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2320 1933
2321 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, 1934 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
2322 fsdata, di_bh, NULL); 1935 pagep, fsdata, di_bh, NULL);
2323 if (ret) { 1936 if (ret) {
2324 mlog_errno(ret); 1937 mlog_errno(ret);
2325 goto out_fail; 1938 goto out_fail;
@@ -2376,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2376 handle_t *handle = wc->w_handle; 1989 handle_t *handle = wc->w_handle;
2377 struct page *tmppage; 1990 struct page *tmppage;
2378 1991
2379 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1992 BUG_ON(!list_empty(&wc->w_unwritten_list));
2380 OCFS2_JOURNAL_ACCESS_WRITE); 1993
2381 if (ret) { 1994 if (handle) {
2382 copied = ret; 1995 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2383 mlog_errno(ret); 1996 wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2384 goto out; 1997 if (ret) {
1998 copied = ret;
1999 mlog_errno(ret);
2000 goto out;
2001 }
2385 } 2002 }
2386 2003
2387 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2004 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2389,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2389 goto out_write_size; 2006 goto out_write_size;
2390 } 2007 }
2391 2008
2392 if (unlikely(copied < len)) { 2009 if (unlikely(copied < len) && wc->w_target_page) {
2393 if (!PageUptodate(wc->w_target_page)) 2010 if (!PageUptodate(wc->w_target_page))
2394 copied = 0; 2011 copied = 0;
2395 2012
2396 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 2013 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
2397 start+len); 2014 start+len);
2398 } 2015 }
2399 flush_dcache_page(wc->w_target_page); 2016 if (wc->w_target_page)
2017 flush_dcache_page(wc->w_target_page);
2400 2018
2401 for(i = 0; i < wc->w_num_pages; i++) { 2019 for(i = 0; i < wc->w_num_pages; i++) {
2402 tmppage = wc->w_pages[i]; 2020 tmppage = wc->w_pages[i];
2403 2021
2022 /* This is the direct io target page. */
2023 if (tmppage == NULL)
2024 continue;
2025
2404 if (tmppage == wc->w_target_page) { 2026 if (tmppage == wc->w_target_page) {
2405 from = wc->w_target_from; 2027 from = wc->w_target_from;
2406 to = wc->w_target_to; 2028 to = wc->w_target_to;
@@ -2419,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2419 } 2041 }
2420 2042
2421 if (page_has_buffers(tmppage)) { 2043 if (page_has_buffers(tmppage)) {
2422 if (ocfs2_should_order_data(inode)) 2044 if (handle && ocfs2_should_order_data(inode))
2423 ocfs2_jbd2_file_inode(wc->w_handle, inode); 2045 ocfs2_jbd2_file_inode(handle, inode);
2424 block_commit_write(tmppage, from, to); 2046 block_commit_write(tmppage, from, to);
2425 } 2047 }
2426 } 2048 }
2427 2049
2428out_write_size: 2050out_write_size:
2429 pos += copied; 2051 /* Direct io do not update i_size here. */
2430 if (pos > i_size_read(inode)) { 2052 if (wc->w_type != OCFS2_WRITE_DIRECT) {
2431 i_size_write(inode, pos); 2053 pos += copied;
2432 mark_inode_dirty(inode); 2054 if (pos > i_size_read(inode)) {
2433 } 2055 i_size_write(inode, pos);
2434 inode->i_blocks = ocfs2_inode_sector_count(inode); 2056 mark_inode_dirty(inode);
2435 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2057 }
2436 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2058 inode->i_blocks = ocfs2_inode_sector_count(inode);
2437 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2059 di->i_size = cpu_to_le64((u64)i_size_read(inode));
2438 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2060 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2439 ocfs2_update_inode_fsync_trans(handle, inode, 1); 2061 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2440 ocfs2_journal_dirty(handle, wc->w_di_bh); 2062 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2063 ocfs2_update_inode_fsync_trans(handle, inode, 1);
2064 }
2065 if (handle)
2066 ocfs2_journal_dirty(handle, wc->w_di_bh);
2441 2067
2442out: 2068out:
2443 /* unlock pages before dealloc since it needs acquiring j_trans_barrier 2069 /* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2447,7 +2073,8 @@ out:
2447 */ 2073 */
2448 ocfs2_unlock_pages(wc); 2074 ocfs2_unlock_pages(wc);
2449 2075
2450 ocfs2_commit_trans(osb, handle); 2076 if (handle)
2077 ocfs2_commit_trans(osb, handle);
2451 2078
2452 ocfs2_run_deallocs(osb, &wc->w_dealloc); 2079 ocfs2_run_deallocs(osb, &wc->w_dealloc);
2453 2080
@@ -2472,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2472 return ret; 2099 return ret;
2473} 2100}
2474 2101
2102struct ocfs2_dio_write_ctxt {
2103 struct list_head dw_zero_list;
2104 unsigned dw_zero_count;
2105 int dw_orphaned;
2106 pid_t dw_writer_pid;
2107};
2108
2109static struct ocfs2_dio_write_ctxt *
2110ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
2111{
2112 struct ocfs2_dio_write_ctxt *dwc = NULL;
2113
2114 if (bh->b_private)
2115 return bh->b_private;
2116
2117 dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
2118 if (dwc == NULL)
2119 return NULL;
2120 INIT_LIST_HEAD(&dwc->dw_zero_list);
2121 dwc->dw_zero_count = 0;
2122 dwc->dw_orphaned = 0;
2123 dwc->dw_writer_pid = task_pid_nr(current);
2124 bh->b_private = dwc;
2125 *alloc = 1;
2126
2127 return dwc;
2128}
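
/* A sketch of the get-or-create pattern above (userspace stand-ins
 * only): the write context rides in bh->b_private across the repeated
 * get_block calls of one direct IO, and the out-parameter tells the
 * caller whether this call created it, i.e. whether this is the first
 * get_block of the IO. */

#include <stdio.h>
#include <stdlib.h>

struct toy_bh { void *b_private; };
struct toy_dio_ctx { int zero_count; };

static struct toy_dio_ctx *get_or_create_ctx(struct toy_bh *bh, int *created)
{
	struct toy_dio_ctx *ctx;

	if (bh->b_private)
		return bh->b_private;	/* later call: reuse */

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;
	bh->b_private = ctx;
	*created = 1;			/* first call of this IO */
	return ctx;
}

int main(void)
{
	struct toy_bh bh = { 0 };
	int first = 0;

	get_or_create_ctx(&bh, &first);
	printf("first call created ctx: %d\n", first);
	first = 0;
	get_or_create_ctx(&bh, &first);
	printf("second call created ctx: %d\n", first);
	free(bh.b_private);
	return 0;
}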
2129
2130static void ocfs2_dio_free_write_ctx(struct inode *inode,
2131 struct ocfs2_dio_write_ctxt *dwc)
2132{
2133 ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
2134 kfree(dwc);
2135}
2136
2137/*
2138 * TODO: Make this into a generic get_blocks function.
2139 *
2140 * From do_direct_io in direct-io.c:
2141 * "So what we do is to permit the ->get_blocks function to populate
2142 * bh.b_size with the size of IO which is permitted at this offset and
2143 * this i_blkbits."
2144 *
2145 * This function is called directly from get_more_blocks in direct-io.c.
2146 *
2147 * called like this: dio->get_blocks(dio->inode, fs_startblk,
2148 * fs_count, map_bh, dio->rw == WRITE);
2149 */
2150static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2151 struct buffer_head *bh_result, int create)
2152{
2153 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2154 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2155 struct ocfs2_write_ctxt *wc;
2156 struct ocfs2_write_cluster_desc *desc = NULL;
2157 struct ocfs2_dio_write_ctxt *dwc = NULL;
2158 struct buffer_head *di_bh = NULL;
2159 u64 p_blkno;
2160 loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
2161 unsigned len, total_len = bh_result->b_size;
2162 int ret = 0, first_get_block = 0;
2163
2164 len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
2165 len = min(total_len, len);
2166
2167 mlog(0, "get block of %lu at %llu:%u req %u\n",
2168 inode->i_ino, pos, len, total_len);
2169
2170 /*
2171	 * Because we may need to change the file size in ocfs2_dio_end_io_write(),
2172	 * or add the inode to the orphan dir, we cannot take the fast path
2173	 * when the file size will be changed.
2174 */
2175 if (pos + total_len <= i_size_read(inode)) {
2176 down_read(&oi->ip_alloc_sem);
2177		/* This is the fast path for rewrites. */
2178 ret = ocfs2_get_block(inode, iblock, bh_result, create);
2179
2180 up_read(&oi->ip_alloc_sem);
2181
2182 if (buffer_mapped(bh_result) &&
2183 !buffer_new(bh_result) &&
2184 ret == 0)
2185 goto out;
2186
2187 /* Clear state set by ocfs2_get_block. */
2188 bh_result->b_state = 0;
2189 }
2190
2191 dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
2192 if (unlikely(dwc == NULL)) {
2193 ret = -ENOMEM;
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197
2198 if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
2199 ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
2200 !dwc->dw_orphaned) {
2201 /*
2202	 * When we are going to allocate extents beyond the file size, add the
2203	 * inode to the orphan dir first, so we can reclaim that space if the
2204	 * system crashes during the write.
2205 */
2206 ret = ocfs2_add_inode_to_orphan(osb, inode);
2207 if (ret < 0) {
2208 mlog_errno(ret);
2209 goto out;
2210 }
2211 dwc->dw_orphaned = 1;
2212 }
2213
2214 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2215 if (ret) {
2216 mlog_errno(ret);
2217 goto out;
2218 }
2219
2220 down_write(&oi->ip_alloc_sem);
2221
2222 if (first_get_block) {
2223 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
2224 ret = ocfs2_zero_tail(inode, di_bh, pos);
2225 else
2226 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2227 total_len, NULL);
2228 if (ret < 0) {
2229 mlog_errno(ret);
2230 goto unlock;
2231 }
2232 }
2233
2234 ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
2235 OCFS2_WRITE_DIRECT, NULL,
2236 (void **)&wc, di_bh, NULL);
2237 if (ret) {
2238 mlog_errno(ret);
2239 goto unlock;
2240 }
2241
2242 desc = &wc->w_desc[0];
2243
2244 p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
2245 BUG_ON(p_blkno == 0);
2246 p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
2247
2248 map_bh(bh_result, inode->i_sb, p_blkno);
2249 bh_result->b_size = len;
2250 if (desc->c_needs_zero)
2251 set_buffer_new(bh_result);
2252
2253	/* end_io may sleep, which must not happen in irq context, so defer
2254	 * completion to the dio work queue. */
2255 set_buffer_defer_completion(bh_result);
2256
2257 if (!list_empty(&wc->w_unwritten_list)) {
2258 struct ocfs2_unwritten_extent *ue = NULL;
2259
2260 ue = list_first_entry(&wc->w_unwritten_list,
2261 struct ocfs2_unwritten_extent,
2262 ue_node);
2263 BUG_ON(ue->ue_cpos != desc->c_cpos);
2264		/* The physical address may still be 0; fill it in now. */
2265 ue->ue_phys = desc->c_phys;
2266
2267 list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
2268 dwc->dw_zero_count++;
2269 }
2270
2271 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
2272 BUG_ON(ret != len);
2273 ret = 0;
2274unlock:
2275 up_write(&oi->ip_alloc_sem);
2276 ocfs2_inode_unlock(inode, 1);
2277 brelse(di_bh);
2278out:
2279 if (ret < 0)
2280 ret = -EIO;
2281 return ret;
2282}
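
/* A sketch of the block-mapping arithmetic in the function above:
 * translate a logical fs block into a physical block by mapping its
 * cluster and re-adding the block offset within the cluster. The
 * shifts are assumptions (4 KB blocks, 1 MB clusters); the kernel
 * derives them from the superblock. */

#include <stdio.h>
#include <stdint.h>

#define BLKBITS	12				/* assumed 4 KB blocks */
#define CLBITS	20				/* assumed 1 MB clusters */
#define BPC	(1u << (CLBITS - BLKBITS))	/* blocks per cluster */

int main(void)
{
	uint64_t iblock = 300;			/* logical block */
	uint32_t v_cluster = iblock >> (CLBITS - BLKBITS);
	uint32_t p_cluster = 7;			/* pretend extent lookup */

	uint64_t p_blkno = (uint64_t)p_cluster << (CLBITS - BLKBITS);
	p_blkno += iblock & (uint64_t)(BPC - 1);	/* offset inside cluster */

	printf("logical block %llu -> cluster %u -> physical block %llu\n",
	       (unsigned long long)iblock, v_cluster,
	       (unsigned long long)p_blkno);
	return 0;
}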
2283
2284static void ocfs2_dio_end_io_write(struct inode *inode,
2285 struct ocfs2_dio_write_ctxt *dwc,
2286 loff_t offset,
2287 ssize_t bytes)
2288{
2289 struct ocfs2_cached_dealloc_ctxt dealloc;
2290 struct ocfs2_extent_tree et;
2291 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2292 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2293 struct ocfs2_unwritten_extent *ue = NULL;
2294 struct buffer_head *di_bh = NULL;
2295 struct ocfs2_dinode *di;
2296 struct ocfs2_alloc_context *data_ac = NULL;
2297 struct ocfs2_alloc_context *meta_ac = NULL;
2298 handle_t *handle = NULL;
2299 loff_t end = offset + bytes;
2300 int ret = 0, credits = 0, locked = 0;
2301
2302 ocfs2_init_dealloc_ctxt(&dealloc);
2303
2304	/* We clear unwritten extents, delete the orphan entry, and change i_size
2305	 * here. If none of these is needed, we can skip all this. */
2306 if (list_empty(&dwc->dw_zero_list) &&
2307 end <= i_size_read(inode) &&
2308 !dwc->dw_orphaned)
2309 goto out;
2310
2311	/* ocfs2_file_write_iter will have taken i_mutex, so we need not lock
2312	 * again if we are in that context. */
2313 if (dwc->dw_writer_pid != task_pid_nr(current)) {
2314 mutex_lock(&inode->i_mutex);
2315 locked = 1;
2316 }
2317
2318 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2319 if (ret < 0) {
2320 mlog_errno(ret);
2321 goto out;
2322 }
2323
2324 down_write(&oi->ip_alloc_sem);
2325
2326	/* Delete the orphan entry before acquiring i_mutex. */
2327 if (dwc->dw_orphaned) {
2328 BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
2329
2330 end = end > i_size_read(inode) ? end : 0;
2331
2332 ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
2333 !!end, end);
2334 if (ret < 0)
2335 mlog_errno(ret);
2336 }
2337
2338 di = (struct ocfs2_dinode *)di_bh;
2339
2340 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
2341
2342 ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
2343 &data_ac, &meta_ac);
2344 if (ret) {
2345 mlog_errno(ret);
2346 goto unlock;
2347 }
2348
2349 credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
2350
2351 handle = ocfs2_start_trans(osb, credits);
2352 if (IS_ERR(handle)) {
2353 ret = PTR_ERR(handle);
2354 mlog_errno(ret);
2355 goto unlock;
2356 }
2357 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2358 OCFS2_JOURNAL_ACCESS_WRITE);
2359 if (ret) {
2360 mlog_errno(ret);
2361 goto commit;
2362 }
2363
2364 list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
2365 ret = ocfs2_mark_extent_written(inode, &et, handle,
2366 ue->ue_cpos, 1,
2367 ue->ue_phys,
2368 meta_ac, &dealloc);
2369 if (ret < 0) {
2370 mlog_errno(ret);
2371 break;
2372 }
2373 }
2374
2375 if (end > i_size_read(inode)) {
2376 ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
2377 if (ret < 0)
2378 mlog_errno(ret);
2379 }
2380commit:
2381 ocfs2_commit_trans(osb, handle);
2382unlock:
2383 up_write(&oi->ip_alloc_sem);
2384 ocfs2_inode_unlock(inode, 1);
2385 brelse(di_bh);
2386out:
2387 if (data_ac)
2388 ocfs2_free_alloc_context(data_ac);
2389 if (meta_ac)
2390 ocfs2_free_alloc_context(meta_ac);
2391 ocfs2_run_deallocs(osb, &dealloc);
2392 if (locked)
2393 mutex_unlock(&inode->i_mutex);
2394 ocfs2_dio_free_write_ctx(inode, dwc);
2395}
2396
2397/*
2398 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
2399 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
2400 * to protect io on one node from truncation on another.
2401 */
2402static int ocfs2_dio_end_io(struct kiocb *iocb,
2403 loff_t offset,
2404 ssize_t bytes,
2405 void *private)
2406{
2407 struct inode *inode = file_inode(iocb->ki_filp);
2408 int level;
2409
2410 if (bytes <= 0)
2411 return 0;
2412
2413 /* this io's submitter should not have unlocked this before we could */
2414 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
2415
2416 if (private)
2417 ocfs2_dio_end_io_write(inode, private, offset, bytes);
2418
2419 ocfs2_iocb_clear_rw_locked(iocb);
2420
2421 level = ocfs2_iocb_rw_locked_level(iocb);
2422 ocfs2_rw_unlock(inode, level);
2423 return 0;
2424}
2425
2426static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
2427 loff_t offset)
2428{
2429 struct file *file = iocb->ki_filp;
2430 struct inode *inode = file_inode(file)->i_mapping->host;
2431 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2432 loff_t end = offset + iter->count;
2433 get_block_t *get_block;
2434
2435 /*
2436	 * Fall back to buffered I/O if we see an inode without
2437 * extents.
2438 */
2439 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2440 return 0;
2441
2442	/* Fall back to buffered I/O if we do not support append dio. */
2443 if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
2444 return 0;
2445
2446 if (iov_iter_rw(iter) == READ)
2447 get_block = ocfs2_get_block;
2448 else
2449 get_block = ocfs2_dio_get_block;
2450
2451 return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
2452 iter, offset, get_block,
2453 ocfs2_dio_end_io, NULL, 0);
2454}
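
/* A sketch of the dispatch above (userspace stand-ins; names are
 * illustrative): reads keep the plain get_block while writes use the
 * direct-IO-aware one, and returning 0 from ->direct_IO tells the VFS
 * to fall back to buffered IO. */

#include <stdio.h>

typedef long (*get_block_t)(unsigned long iblock);

static long plain_get_block(unsigned long b) { return (long)b; }
static long toy_dio_get_block(unsigned long b) { return (long)b; }

static long toy_direct_io(int is_read, long end, long i_size, int append_dio)
{
	get_block_t get_block;

	if (end > i_size && !append_dio)
		return 0;		/* 0 => VFS falls back to buffered IO */

	get_block = is_read ? plain_get_block : toy_dio_get_block;
	return get_block(0) >= 0 ? end : -1;	/* would drive the block IO */
}

int main(void)
{
	printf("append without support: %ld (buffered fallback)\n",
	       toy_direct_io(0, 8192, 4096, 0));
	printf("append with support:    %ld\n",
	       toy_direct_io(0, 8192, 4096, 1));
	return 0;
}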
2455
2475const struct address_space_operations ocfs2_aops = { 2456const struct address_space_operations ocfs2_aops = {
2476 .readpage = ocfs2_readpage, 2457 .readpage = ocfs2_readpage,
2477 .readpages = ocfs2_readpages, 2458 .readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
47 loff_t pos, unsigned len, unsigned copied, 47 loff_t pos, unsigned len, unsigned copied,
48 struct page *page, void *fsdata); 48 struct page *page, void *fsdata);
49 49
50int ocfs2_write_begin_nolock(struct file *filp, 50typedef enum {
51 struct address_space *mapping, 51 OCFS2_WRITE_BUFFER = 0,
52 loff_t pos, unsigned len, unsigned flags, 52 OCFS2_WRITE_DIRECT,
53 OCFS2_WRITE_MMAP,
54} ocfs2_write_type_t;
55
56int ocfs2_write_begin_nolock(struct address_space *mapping,
57 loff_t pos, unsigned len, ocfs2_write_type_t type,
53 struct page **pagep, void **fsdata, 58 struct page **pagep, void **fsdata,
54 struct buffer_head *di_bh, struct page *mmap_page); 59 struct buffer_head *di_bh, struct page *mmap_page);
55 60
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
79enum ocfs2_iocb_lock_bits { 84enum ocfs2_iocb_lock_bits {
80 OCFS2_IOCB_RW_LOCK = 0, 85 OCFS2_IOCB_RW_LOCK = 0,
81 OCFS2_IOCB_RW_LOCK_LEVEL, 86 OCFS2_IOCB_RW_LOCK_LEVEL,
82 OCFS2_IOCB_UNALIGNED_IO,
83 OCFS2_IOCB_NUM_LOCKS 87 OCFS2_IOCB_NUM_LOCKS
84}; 88};
85 89
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
88#define ocfs2_iocb_rw_locked_level(iocb) \ 92#define ocfs2_iocb_rw_locked_level(iocb) \
89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 93 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
90 94
91#define ocfs2_iocb_set_unaligned_aio(iocb) \
92 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
93#define ocfs2_iocb_clear_unaligned_aio(iocb) \
94 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
95#define ocfs2_iocb_is_unaligned_aio(iocb) \
96 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
97
98#endif /* OCFS2_FILE_H */ 95#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a76b9ea7722e..bd15929b5f92 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt {
287static void o2hb_write_timeout(struct work_struct *work) 287static void o2hb_write_timeout(struct work_struct *work)
288{ 288{
289 int failed, quorum; 289 int failed, quorum;
290 unsigned long flags;
291 struct o2hb_region *reg = 290 struct o2hb_region *reg =
292 container_of(work, struct o2hb_region, 291 container_of(work, struct o2hb_region,
293 hr_write_timeout_work.work); 292 hr_write_timeout_work.work);
@@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work)
297 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 296 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
298 297
299 if (o2hb_global_heartbeat_active()) { 298 if (o2hb_global_heartbeat_active()) {
300 spin_lock_irqsave(&o2hb_live_lock, flags); 299 spin_lock(&o2hb_live_lock);
301 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 300 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
302 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); 301 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
303 failed = bitmap_weight(o2hb_failed_region_bitmap, 302 failed = bitmap_weight(o2hb_failed_region_bitmap,
304 O2NM_MAX_REGIONS); 303 O2NM_MAX_REGIONS);
305 quorum = bitmap_weight(o2hb_quorum_region_bitmap, 304 quorum = bitmap_weight(o2hb_quorum_region_bitmap,
306 O2NM_MAX_REGIONS); 305 O2NM_MAX_REGIONS);
307 spin_unlock_irqrestore(&o2hb_live_lock, flags); 306 spin_unlock(&o2hb_live_lock);
308 307
309 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", 308 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
310 quorum, failed); 309 quorum, failed);
@@ -1445,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item)
1445 debugfs_remove(reg->hr_debug_dir); 1444 debugfs_remove(reg->hr_debug_dir);
1446 kfree(reg->hr_db_livenodes); 1445 kfree(reg->hr_db_livenodes);
1447 kfree(reg->hr_db_regnum); 1446 kfree(reg->hr_db_regnum);
1448 kfree(reg->hr_debug_elapsed_time); 1447 kfree(reg->hr_db_elapsed_time);
1449 kfree(reg->hr_debug_pinned); 1448 kfree(reg->hr_db_pinned);
1450 1449
1451 spin_lock(&o2hb_live_lock); 1450 spin_lock(&o2hb_live_lock);
1452 list_del(&reg->hr_all_item); 1451 list_del(&reg->hr_all_item);
@@ -2425,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2425int o2hb_check_node_heartbeating_no_sem(u8 node_num) 2424int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2426{ 2425{
2427 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2426 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2428 unsigned long flags;
2429 2427
2430 spin_lock_irqsave(&o2hb_live_lock, flags); 2428 spin_lock(&o2hb_live_lock);
2431 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); 2429 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2432 spin_unlock_irqrestore(&o2hb_live_lock, flags); 2430 spin_unlock(&o2hb_live_lock);
2433 if (!test_bit(node_num, testing_map)) { 2431 if (!test_bit(node_num, testing_map)) {
2434 mlog(ML_HEARTBEAT, 2432 mlog(ML_HEARTBEAT,
2435 "node (%u) does not have heartbeating enabled.\n", 2433 "node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ebe543894db0..b17d180bdc16 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -630,7 +630,6 @@ static void o2nm_cluster_release(struct config_item *item)
630{ 630{
631 struct o2nm_cluster *cluster = to_o2nm_cluster(item); 631 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
632 632
633 kfree(cluster->cl_group.default_groups);
634 kfree(cluster); 633 kfree(cluster);
635} 634}
636 635
@@ -666,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
666 struct o2nm_cluster *cluster = NULL; 665 struct o2nm_cluster *cluster = NULL;
667 struct o2nm_node_group *ns = NULL; 666 struct o2nm_node_group *ns = NULL;
668 struct config_group *o2hb_group = NULL, *ret = NULL; 667 struct config_group *o2hb_group = NULL, *ret = NULL;
669 void *defs = NULL;
670 668
671 /* this runs under the parent dir's i_mutex; there can be only 669 /* this runs under the parent dir's i_mutex; there can be only
672 * one caller in here at a time */ 670 * one caller in here at a time */
@@ -675,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
675 673
676 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); 674 cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
677 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); 675 ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
678 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
679 o2hb_group = o2hb_alloc_hb_set(); 676 o2hb_group = o2hb_alloc_hb_set();
680 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) 677 if (cluster == NULL || ns == NULL || o2hb_group == NULL)
681 goto out; 678 goto out;
682 679
683 config_group_init_type_name(&cluster->cl_group, name, 680 config_group_init_type_name(&cluster->cl_group, name,
684 &o2nm_cluster_type); 681 &o2nm_cluster_type);
682 configfs_add_default_group(&ns->ns_group, &cluster->cl_group);
683
685 config_group_init_type_name(&ns->ns_group, "node", 684 config_group_init_type_name(&ns->ns_group, "node",
686 &o2nm_node_group_type); 685 &o2nm_node_group_type);
686 configfs_add_default_group(o2hb_group, &cluster->cl_group);
687 687
688 cluster->cl_group.default_groups = defs;
689 cluster->cl_group.default_groups[0] = &ns->ns_group;
690 cluster->cl_group.default_groups[1] = o2hb_group;
691 cluster->cl_group.default_groups[2] = NULL;
692 rwlock_init(&cluster->cl_nodes_lock); 688 rwlock_init(&cluster->cl_nodes_lock);
693 cluster->cl_node_ip_tree = RB_ROOT; 689 cluster->cl_node_ip_tree = RB_ROOT;
694 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; 690 cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
@@ -704,7 +700,6 @@ out:
704 kfree(cluster); 700 kfree(cluster);
705 kfree(ns); 701 kfree(ns);
706 o2hb_free_hb_set(o2hb_group); 702 o2hb_free_hb_set(o2hb_group);
707 kfree(defs);
708 ret = ERR_PTR(-ENOMEM); 703 ret = ERR_PTR(-ENOMEM);
709 } 704 }
710 705
@@ -714,18 +709,11 @@ out:
714static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) 709static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
715{ 710{
716 struct o2nm_cluster *cluster = to_o2nm_cluster(item); 711 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
717 int i;
718 struct config_item *killme;
719 712
720 BUG_ON(o2nm_single_cluster != cluster); 713 BUG_ON(o2nm_single_cluster != cluster);
721 o2nm_single_cluster = NULL; 714 o2nm_single_cluster = NULL;
722 715
723 for (i = 0; cluster->cl_group.default_groups[i]; i++) { 716 configfs_remove_default_groups(&cluster->cl_group);
724 killme = &cluster->cl_group.default_groups[i]->cg_item;
725 cluster->cl_group.default_groups[i] = NULL;
726 config_item_put(killme);
727 }
728
729 config_item_put(item); 717 config_item_put(item);
730} 718}
731 719
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e63ff6..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
282#define DLM_LOCK_RES_DROPPING_REF 0x00000040 282#define DLM_LOCK_RES_DROPPING_REF 0x00000040
283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
285#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
285 286
286/* max milliseconds to wait to sync up a network failure with a node death */ 287/* max milliseconds to wait to sync up a network failure with a node death */
287#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) 288#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -451,6 +452,7 @@ enum {
451 DLM_QUERY_REGION = 519, 452 DLM_QUERY_REGION = 519,
452 DLM_QUERY_NODEINFO = 520, 453 DLM_QUERY_NODEINFO = 520,
453 DLM_BEGIN_EXIT_DOMAIN_MSG = 521, 454 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
455 DLM_DEREF_LOCKRES_DONE = 522,
454}; 456};
455 457
456struct dlm_reco_node_data 458struct dlm_reco_node_data
@@ -545,7 +547,7 @@ struct dlm_master_requery
545 * }; 547 * };
546 * 548 *
547 * from ../cluster/tcp.h 549 * from ../cluster/tcp.h
548 * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) 550 * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
549 * (roughly 4080 bytes) 551 * (roughly 4080 bytes)
550 * and sizeof(dlm_migratable_lockres) = 112 bytes 552 * and sizeof(dlm_migratable_lockres) = 112 bytes
551 * and sizeof(dlm_migratable_lock) = 16 bytes 553 * and sizeof(dlm_migratable_lock) = 16 bytes
@@ -586,7 +588,7 @@ struct dlm_migratable_lockres
586 588
587/* from above, 128 bytes 589/* from above, 128 bytes
588 * for some undetermined future use */ 590 * for some undetermined future use */
589#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ 591#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
590 DLM_MIG_LOCKRES_MAX_LEN) 592 DLM_MIG_LOCKRES_MAX_LEN)
591 593
592struct dlm_create_lock 594struct dlm_create_lock
@@ -782,6 +784,20 @@ struct dlm_deref_lockres
782 u8 name[O2NM_MAX_NAME_LEN]; 784 u8 name[O2NM_MAX_NAME_LEN];
783}; 785};
784 786
787enum {
788 DLM_DEREF_RESPONSE_DONE = 0,
789 DLM_DEREF_RESPONSE_INPROG = 1,
790};
791
792struct dlm_deref_lockres_done {
793 u32 pad1;
794 u16 pad2;
795 u8 node_idx;
796 u8 namelen;
797
798 u8 name[O2NM_MAX_NAME_LEN];
799};
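
/* A C11 sketch checking the wire layout of the new message: the two
 * pad fields align node_idx/namelen so the fixed-size header is
 * exactly 8 bytes before the name. The 64-byte name length is an
 * assumption standing in for O2NM_MAX_NAME_LEN. */

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

struct deref_lockres_done {
	uint32_t pad1;
	uint16_t pad2;
	uint8_t  node_idx;
	uint8_t  namelen;
	uint8_t  name[64];	/* assumed O2NM_MAX_NAME_LEN */
};

static_assert(offsetof(struct deref_lockres_done, name) == 8,
	      "header must be 8 bytes");
static_assert(sizeof(struct deref_lockres_done) == 72,
	      "no unexpected padding");

int main(void) { return 0; }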
800
785static inline enum dlm_status 801static inline enum dlm_status
786__dlm_lockres_state_to_status(struct dlm_lock_resource *res) 802__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
787{ 803{
@@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
789 805
790 assert_spin_locked(&res->spinlock); 806 assert_spin_locked(&res->spinlock);
791 807
792 if (res->state & DLM_LOCK_RES_RECOVERING) 808 if (res->state & (DLM_LOCK_RES_RECOVERING|
809 DLM_LOCK_RES_RECOVERY_WAITING))
793 status = DLM_RECOVERING; 810 status = DLM_RECOVERING;
794 else if (res->state & DLM_LOCK_RES_MIGRATING) 811 else if (res->state & DLM_LOCK_RES_MIGRATING)
795 status = DLM_MIGRATING; 812 status = DLM_MIGRATING;
@@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
968void dlm_assert_master_post_handler(int status, void *data, void *ret_data); 985void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
969int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 986int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
970 void **ret_data); 987 void **ret_data);
988int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
989 void **ret_data);
971int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 990int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
972 void **ret_data); 991 void **ret_data);
973int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 992int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
1009{ 1028{
1010 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| 1029 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
1011 DLM_LOCK_RES_RECOVERING| 1030 DLM_LOCK_RES_RECOVERING|
1031 DLM_LOCK_RES_RECOVERY_WAITING|
1012 DLM_LOCK_RES_MIGRATING)); 1032 DLM_LOCK_RES_MIGRATING));
1013} 1033}
1014 1034
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB) 212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); 213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
214 214
215 /*
216 * Move the lock to the tail because it may be the only lock which has
217 * an invalid lvb.
218 */
219 list_move_tail(&lock->list, &res->granted);
220
215 status = DLM_NORMAL; 221 status = DLM_NORMAL;
216 *call_ast = 1; 222 *call_ast = 1;
217 goto unlock_exit; 223 goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
262 struct dlm_lock *lock, int flags, int type) 268 struct dlm_lock *lock, int flags, int type)
263{ 269{
264 enum dlm_status status; 270 enum dlm_status status;
271 u8 old_owner = res->owner;
265 272
266 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, 273 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
267 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); 274 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
287 status = DLM_DENIED; 294 status = DLM_DENIED;
288 goto bail; 295 goto bail;
289 } 296 }
297
298 if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
299 mlog(0, "last convert request returned DLM_RECOVERING, but "
300 "owner has already queued and sent ast to me. res %.*s, "
301 "(cookie=%u:%llu, type=%d, conv=%d)\n",
302 res->lockname.len, res->lockname.name,
303 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
304 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
305 lock->ml.type, lock->ml.convert_type);
306 status = DLM_NORMAL;
307 goto bail;
308 }
309
290 res->state |= DLM_LOCK_RES_IN_PROGRESS; 310 res->state |= DLM_LOCK_RES_IN_PROGRESS;
291 /* move lock to local convert queue */ 311 /* move lock to local convert queue */
292 /* do not alter lock refcount. switching lists. */ 312 /* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
316 spin_lock(&res->spinlock); 336 spin_lock(&res->spinlock);
317 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 337 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
318 lock->convert_pending = 0; 338 lock->convert_pending = 0;
319	/* if it failed, move it back to granted queue */ 339	/* if it failed, move it back to the granted queue.
340	 * If the master returned DLM_NORMAL and then went down before sending
341	 * the ast, the lock may already have been moved to the granted queue;
342	 * reset status to DLM_RECOVERING and retry the convert */
320 if (status != DLM_NORMAL) { 343 if (status != DLM_NORMAL) {
321 if (status != DLM_NOTQUEUED) 344 if (status != DLM_NOTQUEUED)
322 dlm_error(status); 345 dlm_error(status);
323 dlm_revert_pending_convert(res, lock); 346 dlm_revert_pending_convert(res, lock);
347 } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
348 (old_owner != res->owner)) {
349 mlog(0, "res %.*s is in recovering or has been recovered.\n",
350 res->lockname.len, res->lockname.name);
351 status = DLM_RECOVERING;
324 } 352 }
325bail: 353bail:
326 spin_unlock(&res->spinlock); 354 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..12e064b8be9a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * - Message DLM_QUERY_NODEINFO added to allow online node removes 132 * - Message DLM_QUERY_NODEINFO added to allow online node removes
133 * New in version 1.2: 133 * New in version 1.2:
134 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain 134 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 * New in version 1.3:
136 * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
137 * refmap is cleared
135 */ 138 */
136static const struct dlm_protocol_version dlm_protocol = { 139static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 140 .pv_major = 1,
138 .pv_minor = 2, 141 .pv_minor = 3,
139}; 142};
140 143
141#define DLM_DOMAIN_BACKOFF_MS 200 144#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
1396 unsigned int map_size) 1399 unsigned int map_size)
1397{ 1400{
1398 int status, tmpstat; 1401 int status, tmpstat;
1399 unsigned int node; 1402 int node;
1400 1403
1401 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * 1404 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
1402 sizeof(unsigned long))) { 1405 sizeof(unsigned long))) {
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1853 sizeof(struct dlm_exit_domain), 1856 sizeof(struct dlm_exit_domain),
1854 dlm_begin_exit_domain_handler, 1857 dlm_begin_exit_domain_handler,
1855 dlm, NULL, &dlm->dlm_domain_handlers); 1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1856 1861
1862 status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
1863 sizeof(struct dlm_deref_lockres_done),
1864 dlm_deref_lockres_done_handler,
1865 dlm, NULL, &dlm->dlm_domain_handlers);
1857bail: 1866bail:
1858 if (status) 1867 if (status)
1859 dlm_unregister_domain_handlers(dlm); 1868 dlm_unregister_domain_handlers(dlm);
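
Besides bumping the protocol minor version to 1.3, the dlmdomain.c hunk adds the missing error check after the begin-exit-domain registration and then registers the new DLM_DEREF_LOCKRES_DONE handler, relying on the single bail label to unwind every handler registered so far. A minimal userspace imitation of that register-then-bail shape follows; register_a, register_b and unregister_all are hypothetical stand-ins for the o2net calls.

#include <stdio.h>

static int register_a(void) { return 0; }       /* hypothetical handler 1 */
static int register_b(void) { return 0; }       /* hypothetical handler 2 */
static void unregister_all(void) { puts("unwinding all handlers"); }

static int register_handlers(void)
{
        int status;

        status = register_a();
        if (status)
                goto bail;              /* the check the patch adds */

        status = register_b();
bail:
        if (status)
                unregister_all();       /* one unwind path for every failure */
        return status;
}

int main(void)
{
        return register_handlers();
}
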
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e1de37..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2278 dlm_print_one_lock_resource(res); 2278 dlm_print_one_lock_resource(res);
2279 BUG(); 2279 BUG();
2280 } 2280 }
2281 return ret; 2281 return ret ? ret : r;
2282} 2282}
2283 2283
2284int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 2284int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2345 res->lockname.len, res->lockname.name, node); 2345 res->lockname.len, res->lockname.name, node);
2346 dlm_print_one_lock_resource(res); 2346 dlm_print_one_lock_resource(res);
2347 } 2347 }
2348 ret = 0; 2348 ret = DLM_DEREF_RESPONSE_DONE;
2349 goto done; 2349 goto done;
2350 } 2350 }
2351 2351
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2365 spin_unlock(&dlm->work_lock); 2365 spin_unlock(&dlm->work_lock);
2366 2366
2367 queue_work(dlm->dlm_worker, &dlm->dispatched_work); 2367 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2368 return 0; 2368 return DLM_DEREF_RESPONSE_INPROG;
2369 2369
2370done: 2370done:
2371 if (res) 2371 if (res)
@@ -2375,6 +2375,122 @@ done:
2375 return ret; 2375 return ret;
2376} 2376}
2377 2377
2378int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
2379 void **ret_data)
2380{
2381 struct dlm_ctxt *dlm = data;
2382 struct dlm_deref_lockres_done *deref
2383 = (struct dlm_deref_lockres_done *)msg->buf;
2384 struct dlm_lock_resource *res = NULL;
2385 char *name;
2386 unsigned int namelen;
2387 int ret = -EINVAL;
2388 u8 node;
2389 unsigned int hash;
2390
2391 if (!dlm_grab(dlm))
2392 return 0;
2393
2394 name = deref->name;
2395 namelen = deref->namelen;
2396 node = deref->node_idx;
2397
2398 if (namelen > DLM_LOCKID_NAME_MAX) {
2399 mlog(ML_ERROR, "Invalid name length!");
2400 goto done;
2401 }
2402 if (deref->node_idx >= O2NM_MAX_NODES) {
2403 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2404 goto done;
2405 }
2406
2407 hash = dlm_lockid_hash(name, namelen);
2408
2409 spin_lock(&dlm->spinlock);
2410 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2411 if (!res) {
2412 spin_unlock(&dlm->spinlock);
2413 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2414 dlm->name, namelen, name);
2415 goto done;
2416 }
2417
2418 spin_lock(&res->spinlock);
2419 BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
2420 if (!list_empty(&res->purge)) {
2421 mlog(0, "%s: Removing res %.*s from purgelist\n",
2422 dlm->name, res->lockname.len, res->lockname.name);
2423 list_del_init(&res->purge);
2424 dlm_lockres_put(res);
2425 dlm->purge_count--;
2426 }
2427
2428 if (!__dlm_lockres_unused(res)) {
2429 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
2430 dlm->name, res->lockname.len, res->lockname.name);
2431 __dlm_print_one_lock_resource(res);
2432 BUG();
2433 }
2434
2435 __dlm_unhash_lockres(dlm, res);
2436
2437 spin_lock(&dlm->track_lock);
2438 if (!list_empty(&res->tracking))
2439 list_del_init(&res->tracking);
2440 else {
2441 mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
2442 dlm->name, res->lockname.len, res->lockname.name);
2443 __dlm_print_one_lock_resource(res);
2444 }
2445 spin_unlock(&dlm->track_lock);
2446
2447 /* lockres is not in the hash now. drop the flag and wake up
2448 * any processes waiting in dlm_get_lock_resource.
2449 */
2450 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2451 spin_unlock(&res->spinlock);
2452 wake_up(&res->wq);
2453
2454 dlm_lockres_put(res);
2455
2456 spin_unlock(&dlm->spinlock);
2457
2458done:
2459 dlm_put(dlm);
2460 return ret;
2461}
2462
2463static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
2464 struct dlm_lock_resource *res, u8 node)
2465{
2466 struct dlm_deref_lockres_done deref;
2467 int ret = 0, r;
2468 const char *lockname;
2469 unsigned int namelen;
2470
2471 lockname = res->lockname.name;
2472 namelen = res->lockname.len;
2473 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2474
2475 memset(&deref, 0, sizeof(deref));
2476 deref.node_idx = dlm->node_num;
2477 deref.namelen = namelen;
2478 memcpy(deref.name, lockname, namelen);
2479
2480 ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
2481 &deref, sizeof(deref), node, &r);
2482 if (ret < 0) {
2483 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
2484 " to node %u\n", dlm->name, namelen,
2485 lockname, ret, node);
2486 } else if (r < 0) {
2487 /* ignore the error */
2488 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2489 dlm->name, namelen, lockname, node, r);
2490 dlm_print_one_lock_resource(res);
2491 }
2492}
2493
2378static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 2494static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2379{ 2495{
2380 struct dlm_ctxt *dlm; 2496 struct dlm_ctxt *dlm;
@@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2395 } 2511 }
2396 spin_unlock(&res->spinlock); 2512 spin_unlock(&res->spinlock);
2397 2513
2514 dlm_drop_lockres_ref_done(dlm, res, node);
2515
2398 if (cleared) { 2516 if (cleared) {
2399 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 2517 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2400 dlm->name, res->lockname.len, res->lockname.name, node); 2518 dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2432 return 0; 2550 return 0;
2433 2551
 2434 /* delay migration when the lockres is in RECOVERING state */ 2552 /* delay migration when the lockres is in RECOVERING state */
2435 if (res->state & DLM_LOCK_RES_RECOVERING) 2553 if (res->state & (DLM_LOCK_RES_RECOVERING|
2554 DLM_LOCK_RES_RECOVERY_WAITING))
2436 return 0; 2555 return 0;
2437 2556
2438 if (res->owner != dlm->node_num) 2557 if (res->owner != dlm->node_num)
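
dlm_drop_lockres_ref_done(), called from the master's deref worker above, packs a fixed-size DLM_DEREF_LOCKRES_DONE message carrying the sender's node number and the lock resource name, telling the node that dropped its ref that the master has cleared its refmap bit; that node's dlm_deref_lockres_done_handler then pulls the resource off its purge list and unhashes it. The compilable sketch below shows only the packing step; the field names mirror the patch, but NAME_MAX_LEN and the struct layout are illustrative, not the kernel's on-wire format.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define NAME_MAX_LEN 64 /* illustrative; the kernel uses O2NM_MAX_NAME_LEN */

struct deref_done_msg {
        uint8_t node_idx;               /* sender's node number */
        uint8_t namelen;
        char    name[NAME_MAX_LEN];
};

static void fill_deref_done(struct deref_done_msg *m, uint8_t node,
                            const char *lockname, size_t namelen)
{
        assert(namelen <= NAME_MAX_LEN);
        memset(m, 0, sizeof(*m));       /* zero the padding, as the patch does */
        m->node_idx = node;
        m->namelen = (uint8_t)namelen;
        memcpy(m->name, lockname, namelen);
}

int main(void)
{
        struct deref_done_msg m;

        fill_deref_done(&m, 2, "M0000000000000000abc", 20);
        assert(m.node_idx == 2 && m.namelen == 20);
        return 0;
}
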
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b94a425f0175..f6b313898763 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1403 * and RECOVERY flag changed when it completes. */ 1403 * and RECOVERY flag changed when it completes. */
1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); 1404 hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
1405 spin_lock(&dlm->spinlock); 1405 spin_lock(&dlm->spinlock);
1406 res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, 1406 res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
1407 hash); 1407 hash);
1408 if (res) { 1408 if (res) {
1409 /* this will get a ref on res */ 1409 /* this will get a ref on res */
1410 /* mark it as recovering/migrating and hash it */ 1410 /* mark it as recovering/migrating and hash it */
1411 spin_lock(&res->spinlock); 1411 spin_lock(&res->spinlock);
1412 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
1413 mlog(0, "%s: node is attempting to migrate "
1414 "lockres %.*s, but marked as dropping "
1415 " ref!\n", dlm->name,
1416 mres->lockname_len, mres->lockname);
1417 ret = -EINVAL;
1418 spin_unlock(&res->spinlock);
1419 spin_unlock(&dlm->spinlock);
1420 dlm_lockres_put(res);
1421 goto leave;
1422 }
1423
1412 if (mres->flags & DLM_MRES_RECOVERY) { 1424 if (mres->flags & DLM_MRES_RECOVERY) {
1413 res->state |= DLM_LOCK_RES_RECOVERING; 1425 res->state |= DLM_LOCK_RES_RECOVERING;
1414 } else { 1426 } else {
@@ -2071,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
2071 dlm_lock_get(lock); 2083 dlm_lock_get(lock);
2072 if (lock->convert_pending) { 2084 if (lock->convert_pending) {
2073 /* move converting lock back to granted */ 2085 /* move converting lock back to granted */
2074 BUG_ON(i != DLM_CONVERTING_LIST);
2075 mlog(0, "node died with convert pending " 2086 mlog(0, "node died with convert pending "
2076 "on %.*s. move back to granted list.\n", 2087 "on %.*s. move back to granted list.\n",
2077 res->lockname.len, res->lockname.name); 2088 res->lockname.len, res->lockname.name);
@@ -2163,6 +2174,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2163 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2174 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2164 bucket = dlm_lockres_hash(dlm, i); 2175 bucket = dlm_lockres_hash(dlm, i);
2165 hlist_for_each_entry(res, bucket, hash_node) { 2176 hlist_for_each_entry(res, bucket, hash_node) {
2177 if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
2178 spin_lock(&res->spinlock);
2179 res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
2180 spin_unlock(&res->spinlock);
2181 wake_up(&res->wq);
2182 }
2183
2166 if (!(res->state & DLM_LOCK_RES_RECOVERING)) 2184 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2167 continue; 2185 continue;
2168 2186
@@ -2300,6 +2318,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2300 res->lockname.len, res->lockname.name, freed, dead_node); 2318 res->lockname.len, res->lockname.name, freed, dead_node);
2301 __dlm_print_one_lock_resource(res); 2319 __dlm_print_one_lock_resource(res);
2302 } 2320 }
2321 res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
2303 dlm_lockres_clear_refmap_bit(dlm, res, dead_node); 2322 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2304 } else if (test_bit(dead_node, res->refmap)) { 2323 } else if (test_bit(dead_node, res->refmap)) {
2305 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2324 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2377,14 +2396,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2377 dlm_revalidate_lvb(dlm, res, dead_node); 2396 dlm_revalidate_lvb(dlm, res, dead_node);
2378 if (res->owner == dead_node) { 2397 if (res->owner == dead_node) {
2379 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2398 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2380 mlog(ML_NOTICE, "%s: res %.*s, Skip " 2399 mlog(0, "%s:%.*s: owned by "
2381 "recovery as it is being freed\n", 2400 "dead node %u, this node was "
2382 dlm->name, res->lockname.len, 2401 "dropping its ref when it died. "
2383 res->lockname.name); 2402 "continue, dropping the flag.\n",
2384 } else 2403 dlm->name, res->lockname.len,
2385 dlm_move_lockres_to_recovery_list(dlm, 2404 res->lockname.name, dead_node);
2386 res); 2405 }
2387 2406 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2407 dlm_move_lockres_to_recovery_list(dlm,
2408 res);
2388 } else if (res->owner == dlm->node_num) { 2409 } else if (res->owner == dlm->node_num) {
2389 dlm_free_dead_locks(dlm, res, dead_node); 2410 dlm_free_dead_locks(dlm, res, dead_node);
2390 __dlm_lockres_calc_usage(dlm, res); 2411 __dlm_lockres_calc_usage(dlm, res);
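
The recovery hunks introduce DLM_LOCK_RES_RECOVERY_WAITING: dlm_free_dead_locks() sets it when it clears a dead node's refmap bit, the flag then blocks purging, migration and the dirty-list worker exactly as DLM_LOCK_RES_RECOVERING does (see the dlmmaster.c hunk above and the dlmthread.c hunks below), and dlm_finish_local_lockres_recovery() clears it and wakes any waiters. A small C sketch of that flag discipline; the flag values are arbitrary here, not the kernel's.

#include <assert.h>
#include <stdbool.h>

#define RES_RECOVERING       0x01       /* illustrative flag values */
#define RES_RECOVERY_WAITING 0x02

/* Mirrors the checks added to __dlm_lockres_unused() and dlm_thread(). */
static bool res_blocked_by_recovery(unsigned int state)
{
        return state & (RES_RECOVERING | RES_RECOVERY_WAITING);
}

int main(void)
{
        unsigned int state = 0;

        state |= RES_RECOVERY_WAITING;          /* dlm_free_dead_locks() */
        assert(res_blocked_by_recovery(state)); /* purge/migrate deferred */

        state &= ~RES_RECOVERY_WAITING;         /* finish_local_lockres_recovery() */
        assert(!res_blocked_by_recovery(state));
        return 0;
}
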
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
107 return 0; 107 return 0;
108 108
109 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & (DLM_LOCK_RES_RECOVERING|
110 DLM_LOCK_RES_RECOVERY_WAITING))
110 return 0; 111 return 0;
111 112
112 /* Another node has this resource with this node as the master */ 113 /* Another node has this resource with this node as the master */
@@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
202 dlm->purge_count--; 203 dlm->purge_count--;
203 } 204 }
204 205
206 if (!master && ret != 0) {
207 mlog(0, "%s: deref %.*s in progress or master goes down\n",
208 dlm->name, res->lockname.len, res->lockname.name);
209 spin_unlock(&res->spinlock);
210 return;
211 }
212
205 if (!__dlm_lockres_unused(res)) { 213 if (!__dlm_lockres_unused(res)) {
206 mlog(ML_ERROR, "%s: res %.*s in use after deref\n", 214 mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
207 dlm->name, res->lockname.len, res->lockname.name); 215 dlm->name, res->lockname.len, res->lockname.name);
@@ -700,7 +708,8 @@ static int dlm_thread(void *data)
700 * dirty for a short while. */ 708 * dirty for a short while. */
701 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 709 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
702 if (res->state & (DLM_LOCK_RES_IN_PROGRESS | 710 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
703 DLM_LOCK_RES_RECOVERING)) { 711 DLM_LOCK_RES_RECOVERING |
712 DLM_LOCK_RES_RECOVERY_WAITING)) {
704 /* move it to the tail and keep going */ 713 /* move it to the tail and keep going */
705 res->state &= ~DLM_LOCK_RES_DIRTY; 714 res->state &= ~DLM_LOCK_RES_DIRTY;
706 spin_unlock(&res->spinlock); 715 spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7cb38fdca229..c18ab45f8d21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1381,44 +1381,6 @@ out:
1381 return ret; 1381 return ret;
1382} 1382}
1383 1383
1384/*
1385 * Will look for holes and unwritten extents in the range starting at
1386 * pos for count bytes (inclusive).
1387 */
1388static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1389 size_t count)
1390{
1391 int ret = 0;
1392 unsigned int extent_flags;
1393 u32 cpos, clusters, extent_len, phys_cpos;
1394 struct super_block *sb = inode->i_sb;
1395
1396 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1397 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1398
1399 while (clusters) {
1400 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1401 &extent_flags);
1402 if (ret < 0) {
1403 mlog_errno(ret);
1404 goto out;
1405 }
1406
1407 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1408 ret = 1;
1409 break;
1410 }
1411
1412 if (extent_len > clusters)
1413 extent_len = clusters;
1414
1415 clusters -= extent_len;
1416 cpos += extent_len;
1417 }
1418out:
1419 return ret;
1420}
1421
1422static int ocfs2_write_remove_suid(struct inode *inode) 1384static int ocfs2_write_remove_suid(struct inode *inode)
1423{ 1385{
1424 int ret; 1386 int ret;
@@ -2129,18 +2091,12 @@ out:
2129 2091
2130static int ocfs2_prepare_inode_for_write(struct file *file, 2092static int ocfs2_prepare_inode_for_write(struct file *file,
2131 loff_t pos, 2093 loff_t pos,
2132 size_t count, 2094 size_t count)
2133 int appending,
2134 int *direct_io,
2135 int *has_refcount)
2136{ 2095{
2137 int ret = 0, meta_level = 0; 2096 int ret = 0, meta_level = 0;
2138 struct dentry *dentry = file->f_path.dentry; 2097 struct dentry *dentry = file->f_path.dentry;
2139 struct inode *inode = d_inode(dentry); 2098 struct inode *inode = d_inode(dentry);
2140 loff_t end; 2099 loff_t end;
2141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2142 int full_coherency = !(osb->s_mount_opt &
2143 OCFS2_MOUNT_COHERENCY_BUFFERED);
2144 2100
2145 /* 2101 /*
2146 * We start with a read level meta lock and only jump to an ex 2102 * We start with a read level meta lock and only jump to an ex
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2189 pos, 2145 pos,
2190 count, 2146 count,
2191 &meta_level); 2147 &meta_level);
2192 if (has_refcount)
2193 *has_refcount = 1;
2194 if (direct_io)
2195 *direct_io = 0;
2196 } 2148 }
2197 2149
2198 if (ret < 0) { 2150 if (ret < 0) {
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2200 goto out_unlock; 2152 goto out_unlock;
2201 } 2153 }
2202 2154
2203 /*
2204 * Skip the O_DIRECT checks if we don't need
2205 * them.
2206 */
2207 if (!direct_io || !(*direct_io))
2208 break;
2209
2210 /*
2211 * There's no sane way to do direct writes to an inode
2212 * with inline data.
2213 */
2214 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2215 *direct_io = 0;
2216 break;
2217 }
2218
2219 /*
2220 * Allowing concurrent direct writes means
2221 * i_size changes wouldn't be synchronized, so
2222 * one node could wind up truncating another
2223 * nodes writes.
2224 */
2225 if (end > i_size_read(inode) && !full_coherency) {
2226 *direct_io = 0;
2227 break;
2228 }
2229
2230 /*
2231 * Fallback to old way if the feature bit is not set.
2232 */
2233 if (end > i_size_read(inode) &&
2234 !ocfs2_supports_append_dio(osb)) {
2235 *direct_io = 0;
2236 break;
2237 }
2238
2239 /*
2240 * We don't fill holes during direct io, so
2241 * check for them here. If any are found, the
2242 * caller will have to retake some cluster
2243 * locks and initiate the io as buffered.
2244 */
2245 ret = ocfs2_check_range_for_holes(inode, pos, count);
2246 if (ret == 1) {
2247 /*
2248 * Fallback to old way if the feature bit is not set.
2249 * Otherwise try dio first and then complete the rest
2250 * request through buffer io.
2251 */
2252 if (!ocfs2_supports_append_dio(osb))
2253 *direct_io = 0;
2254 ret = 0;
2255 } else if (ret < 0)
2256 mlog_errno(ret);
2257 break; 2155 break;
2258 } 2156 }
2259 2157
2260out_unlock: 2158out_unlock:
2261 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2159 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2262 pos, appending, count, 2160 pos, count);
2263 direct_io, has_refcount);
2264 2161
2265 if (meta_level >= 0) 2162 if (meta_level >= 0)
2266 ocfs2_inode_unlock(inode, meta_level); 2163 ocfs2_inode_unlock(inode, meta_level);
@@ -2272,18 +2169,16 @@ out:
2272static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2169static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2273 struct iov_iter *from) 2170 struct iov_iter *from)
2274{ 2171{
2275 int direct_io, appending, rw_level; 2172 int direct_io, rw_level;
2276 int can_do_direct, has_refcount = 0;
2277 ssize_t written = 0; 2173 ssize_t written = 0;
2278 ssize_t ret; 2174 ssize_t ret;
2279 size_t count = iov_iter_count(from), orig_count; 2175 size_t count = iov_iter_count(from);
2280 struct file *file = iocb->ki_filp; 2176 struct file *file = iocb->ki_filp;
2281 struct inode *inode = file_inode(file); 2177 struct inode *inode = file_inode(file);
2282 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2283 int full_coherency = !(osb->s_mount_opt & 2179 int full_coherency = !(osb->s_mount_opt &
2284 OCFS2_MOUNT_COHERENCY_BUFFERED); 2180 OCFS2_MOUNT_COHERENCY_BUFFERED);
2285 int unaligned_dio = 0; 2181 void *saved_ki_complete = NULL;
2286 int dropped_dio = 0;
2287 int append_write = ((iocb->ki_pos + count) >= 2182 int append_write = ((iocb->ki_pos + count) >=
2288 i_size_read(inode) ? 1 : 0); 2183 i_size_read(inode) ? 1 : 0);
2289 2184
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2296 if (count == 0) 2191 if (count == 0)
2297 return 0; 2192 return 0;
2298 2193
2299 appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
2300 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2194 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2301 2195
2302 inode_lock(inode); 2196 inode_lock(inode);
2303 2197
2304relock:
2305 /* 2198 /*
2306 * Concurrent O_DIRECT writes are allowed with 2199 * Concurrent O_DIRECT writes are allowed with
2307 * mount_option "coherency=buffered". 2200 * mount_option "coherency=buffered".
@@ -2334,7 +2227,6 @@ relock:
2334 ocfs2_inode_unlock(inode, 1); 2227 ocfs2_inode_unlock(inode, 1);
2335 } 2228 }
2336 2229
2337 orig_count = iov_iter_count(from);
2338 ret = generic_write_checks(iocb, from); 2230 ret = generic_write_checks(iocb, from);
2339 if (ret <= 0) { 2231 if (ret <= 0) {
2340 if (ret) 2232 if (ret)
@@ -2343,41 +2235,18 @@ relock:
2343 } 2235 }
2344 count = ret; 2236 count = ret;
2345 2237
2346 can_do_direct = direct_io; 2238 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
2347 ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
2348 &can_do_direct, &has_refcount);
2349 if (ret < 0) { 2239 if (ret < 0) {
2350 mlog_errno(ret); 2240 mlog_errno(ret);
2351 goto out; 2241 goto out;
2352 } 2242 }
2353 2243
2354 if (direct_io && !is_sync_kiocb(iocb)) 2244 if (direct_io && !is_sync_kiocb(iocb) &&
2355 unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); 2245 ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2356
2357 /*
2358 * We can't complete the direct I/O as requested, fall back to
2359 * buffered I/O.
2360 */
2361 if (direct_io && !can_do_direct) {
2362 ocfs2_rw_unlock(inode, rw_level);
2363
2364 rw_level = -1;
2365
2366 direct_io = 0;
2367 iocb->ki_flags &= ~IOCB_DIRECT;
2368 iov_iter_reexpand(from, orig_count);
2369 dropped_dio = 1;
2370 goto relock;
2371 }
2372
2373 if (unaligned_dio) {
2374 /* 2246 /*
2375 * Wait on previous unaligned aio to complete before 2247 * Make it a sync io if it's an unaligned aio.
2376 * proceeding.
2377 */ 2248 */
2378 mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); 2249 saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2379 /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
2380 ocfs2_iocb_set_unaligned_aio(iocb);
2381 } 2250 }
2382 2251
2383 /* communicate with ocfs2_dio_end_io */ 2252 /* communicate with ocfs2_dio_end_io */
@@ -2398,14 +2267,13 @@ relock:
2398 */ 2267 */
2399 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2268 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2400 rw_level = -1; 2269 rw_level = -1;
2401 unaligned_dio = 0;
2402 } 2270 }
2403 2271
2404 if (unlikely(written <= 0)) 2272 if (unlikely(written <= 0))
2405 goto no_sync; 2273 goto out;
2406 2274
2407 if (((file->f_flags & O_DSYNC) && !direct_io) || 2275 if (((file->f_flags & O_DSYNC) && !direct_io) ||
2408 IS_SYNC(inode) || dropped_dio) { 2276 IS_SYNC(inode)) {
2409 ret = filemap_fdatawrite_range(file->f_mapping, 2277 ret = filemap_fdatawrite_range(file->f_mapping,
2410 iocb->ki_pos - written, 2278 iocb->ki_pos - written,
2411 iocb->ki_pos - 1); 2279 iocb->ki_pos - 1);
@@ -2424,13 +2292,10 @@ relock:
2424 iocb->ki_pos - 1); 2292 iocb->ki_pos - 1);
2425 } 2293 }
2426 2294
2427no_sync:
2428 if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
2429 ocfs2_iocb_clear_unaligned_aio(iocb);
2430 mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
2431 }
2432
2433out: 2295out:
2296 if (saved_ki_complete)
2297 xchg(&iocb->ki_complete, saved_ki_complete);
2298
2434 if (rw_level != -1) 2299 if (rw_level != -1)
2435 ocfs2_rw_unlock(inode, rw_level); 2300 ocfs2_rw_unlock(inode, rw_level);
2436 2301
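
The ocfs2_file_write_iter() rewrite drops the whole buffered-I/O fallback dance (the relock label, iov_iter_reexpand(), the ip_unaligned_aio mutex) and instead turns an unaligned AIO into a synchronous one by swapping iocb->ki_complete out with xchg() and restoring it at out:, since an iocb without a completion callback is waited on. The same save/restore shape in portable userspace C, with GCC/Clang __atomic builtins standing in for the kernel's xchg():

#include <assert.h>
#include <stddef.h>

typedef void (*complete_fn)(int result);

static void my_complete(int result) { (void)result; }

struct fake_iocb { complete_fn ki_complete; };  /* trimmed stand-in */

int main(void)
{
        struct fake_iocb iocb = { my_complete };
        complete_fn saved;

        /* Clear the hook: with no completion callback the I/O is synchronous. */
        saved = __atomic_exchange_n(&iocb.ki_complete, (complete_fn)NULL,
                                    __ATOMIC_SEQ_CST);
        assert(iocb.ki_complete == NULL && saved == my_complete);

        /* ... issue the write and wait for it here ... */

        if (saved)      /* restore on the way out, as the patch does at out: */
                __atomic_exchange_n(&iocb.ki_complete, saved, __ATOMIC_SEQ_CST);
        assert(iocb.ki_complete == my_complete);
        return 0;
}
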
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..2cabbcf2f28e
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,606 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * filecheck.c
5 *
6 * Code which implements online file check.
7 *
8 * Copyright (C) 2016 SuSE. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/list.h>
21#include <linux/spinlock.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/kmod.h>
25#include <linux/fs.h>
26#include <linux/kobject.h>
27#include <linux/sysfs.h>
28#include <linux/sysctl.h>
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32#include "ocfs2_fs.h"
33#include "stackglue.h"
34#include "inode.h"
35
36#include "filecheck.h"
37
38
39/* File check error strings,
 40 * must correspond with the error numbers in the header file.
41 */
42static const char * const ocfs2_filecheck_errs[] = {
43 "SUCCESS",
44 "FAILED",
45 "INPROGRESS",
46 "READONLY",
47 "INJBD",
48 "INVALIDINO",
49 "BLOCKECC",
50 "BLOCKNO",
51 "VALIDFLAG",
52 "GENERATION",
53 "UNSUPPORTED"
54};
55
56static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
57static LIST_HEAD(ocfs2_filecheck_sysfs_list);
58
59struct ocfs2_filecheck {
60 struct list_head fc_head; /* File check entry list head */
61 spinlock_t fc_lock;
 62 unsigned int fc_max; /* Maximum number of entries in the list */
63 unsigned int fc_size; /* Current entry count in list */
64 unsigned int fc_done; /* Finished entry count in list */
65};
66
67struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
68 struct list_head fs_list;
69 atomic_t fs_count;
70 struct super_block *fs_sb;
71 struct kset *fs_devicekset;
72 struct kset *fs_fcheckkset;
73 struct ocfs2_filecheck *fs_fcheck;
74};
75
76#define OCFS2_FILECHECK_MAXSIZE 100
77#define OCFS2_FILECHECK_MINSIZE 10
78
79/* File check operation type */
80enum {
81 OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
82 OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
83 OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
84};
85
86struct ocfs2_filecheck_entry {
87 struct list_head fe_list;
88 unsigned long fe_ino;
89 unsigned int fe_type;
90 unsigned int fe_done:1;
91 unsigned int fe_status:31;
92};
93
94struct ocfs2_filecheck_args {
95 unsigned int fa_type;
96 union {
97 unsigned long fa_ino;
98 unsigned int fa_len;
99 };
100};
101
102static const char *
103ocfs2_filecheck_error(int errno)
104{
105 if (!errno)
106 return ocfs2_filecheck_errs[errno];
107
108 BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
109 errno > OCFS2_FILECHECK_ERR_END);
110 return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
111}
112
113static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
114 struct kobj_attribute *attr,
115 char *buf);
116static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
117 struct kobj_attribute *attr,
118 const char *buf, size_t count);
119static struct kobj_attribute ocfs2_attr_filecheck_chk =
120 __ATTR(check, S_IRUSR | S_IWUSR,
121 ocfs2_filecheck_show,
122 ocfs2_filecheck_store);
123static struct kobj_attribute ocfs2_attr_filecheck_fix =
124 __ATTR(fix, S_IRUSR | S_IWUSR,
125 ocfs2_filecheck_show,
126 ocfs2_filecheck_store);
127static struct kobj_attribute ocfs2_attr_filecheck_set =
128 __ATTR(set, S_IRUSR | S_IWUSR,
129 ocfs2_filecheck_show,
130 ocfs2_filecheck_store);
131
132static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
133{
134 schedule();
135 return 0;
136}
137
138static void
139ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
140{
141 struct ocfs2_filecheck_entry *p;
142
143 if (!atomic_dec_and_test(&entry->fs_count))
144 wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
145 TASK_UNINTERRUPTIBLE);
146
147 spin_lock(&entry->fs_fcheck->fc_lock);
148 while (!list_empty(&entry->fs_fcheck->fc_head)) {
149 p = list_first_entry(&entry->fs_fcheck->fc_head,
150 struct ocfs2_filecheck_entry, fe_list);
151 list_del(&p->fe_list);
 152 BUG_ON(!p->fe_done); /* Never free an undone file check entry */
153 kfree(p);
154 }
155 spin_unlock(&entry->fs_fcheck->fc_lock);
156
157 kset_unregister(entry->fs_fcheckkset);
158 kset_unregister(entry->fs_devicekset);
159 kfree(entry->fs_fcheck);
160 kfree(entry);
161}
162
163static void
164ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
165{
166 spin_lock(&ocfs2_filecheck_sysfs_lock);
167 list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
168 spin_unlock(&ocfs2_filecheck_sysfs_lock);
169}
170
171static int ocfs2_filecheck_sysfs_del(const char *devname)
172{
173 struct ocfs2_filecheck_sysfs_entry *p;
174
175 spin_lock(&ocfs2_filecheck_sysfs_lock);
176 list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
177 if (!strcmp(p->fs_sb->s_id, devname)) {
178 list_del(&p->fs_list);
179 spin_unlock(&ocfs2_filecheck_sysfs_lock);
180 ocfs2_filecheck_sysfs_free(p);
181 return 0;
182 }
183 }
184 spin_unlock(&ocfs2_filecheck_sysfs_lock);
185 return 1;
186}
187
188static void
189ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
190{
191 if (atomic_dec_and_test(&entry->fs_count))
192 wake_up_atomic_t(&entry->fs_count);
193}
194
195static struct ocfs2_filecheck_sysfs_entry *
196ocfs2_filecheck_sysfs_get(const char *devname)
197{
198 struct ocfs2_filecheck_sysfs_entry *p = NULL;
199
200 spin_lock(&ocfs2_filecheck_sysfs_lock);
201 list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
202 if (!strcmp(p->fs_sb->s_id, devname)) {
203 atomic_inc(&p->fs_count);
204 spin_unlock(&ocfs2_filecheck_sysfs_lock);
205 return p;
206 }
207 }
208 spin_unlock(&ocfs2_filecheck_sysfs_lock);
209 return NULL;
210}
211
212int ocfs2_filecheck_create_sysfs(struct super_block *sb)
213{
214 int ret = 0;
215 struct kset *device_kset = NULL;
216 struct kset *fcheck_kset = NULL;
217 struct ocfs2_filecheck *fcheck = NULL;
218 struct ocfs2_filecheck_sysfs_entry *entry = NULL;
219 struct attribute **attrs = NULL;
220 struct attribute_group attrgp;
221
222 if (!ocfs2_kset)
223 return -ENOMEM;
224
225 attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
226 if (!attrs) {
227 ret = -ENOMEM;
228 goto error;
229 } else {
230 attrs[0] = &ocfs2_attr_filecheck_chk.attr;
231 attrs[1] = &ocfs2_attr_filecheck_fix.attr;
232 attrs[2] = &ocfs2_attr_filecheck_set.attr;
233 attrs[3] = NULL;
234 memset(&attrgp, 0, sizeof(attrgp));
235 attrgp.attrs = attrs;
236 }
237
238 fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
239 if (!fcheck) {
240 ret = -ENOMEM;
241 goto error;
242 } else {
243 INIT_LIST_HEAD(&fcheck->fc_head);
244 spin_lock_init(&fcheck->fc_lock);
245 fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
246 fcheck->fc_size = 0;
247 fcheck->fc_done = 0;
248 }
249
250 if (strlen(sb->s_id) <= 0) {
251 mlog(ML_ERROR,
252 "Cannot get device basename when create filecheck sysfs\n");
253 ret = -ENODEV;
254 goto error;
255 }
256
257 device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
258 if (!device_kset) {
259 ret = -ENOMEM;
260 goto error;
261 }
262
263 fcheck_kset = kset_create_and_add("filecheck", NULL,
264 &device_kset->kobj);
265 if (!fcheck_kset) {
266 ret = -ENOMEM;
267 goto error;
268 }
269
270 ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
271 if (ret)
272 goto error;
273
274 entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
275 if (!entry) {
276 ret = -ENOMEM;
277 goto error;
278 } else {
279 atomic_set(&entry->fs_count, 1);
280 entry->fs_sb = sb;
281 entry->fs_devicekset = device_kset;
282 entry->fs_fcheckkset = fcheck_kset;
283 entry->fs_fcheck = fcheck;
284 ocfs2_filecheck_sysfs_add(entry);
285 }
286
287 kfree(attrs);
288 return 0;
289
290error:
291 kfree(attrs);
292 kfree(entry);
293 kfree(fcheck);
294 kset_unregister(fcheck_kset);
295 kset_unregister(device_kset);
296 return ret;
297}
298
299int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
300{
301 return ocfs2_filecheck_sysfs_del(sb->s_id);
302}
303
304static int
305ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
306 unsigned int count);
307static int
308ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
309 unsigned int len)
310{
311 int ret;
312
313 if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
314 return -EINVAL;
315
316 spin_lock(&ent->fs_fcheck->fc_lock);
317 if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
318 mlog(ML_ERROR,
319 "Cannot set online file check maximum entry number "
320 "to %u due to too many pending entries(%u)\n",
321 len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
322 ret = -EBUSY;
323 } else {
324 if (len < ent->fs_fcheck->fc_size)
325 BUG_ON(!ocfs2_filecheck_erase_entries(ent,
326 ent->fs_fcheck->fc_size - len));
327
328 ent->fs_fcheck->fc_max = len;
329 ret = 0;
330 }
331 spin_unlock(&ent->fs_fcheck->fc_lock);
332
333 return ret;
334}
335
336#define OCFS2_FILECHECK_ARGS_LEN 24
337static int
338ocfs2_filecheck_args_get_long(const char *buf, size_t count,
339 unsigned long *val)
340{
341 char buffer[OCFS2_FILECHECK_ARGS_LEN];
342
343 memcpy(buffer, buf, count);
344 buffer[count] = '\0';
345
346 if (kstrtoul(buffer, 0, val))
347 return 1;
348
349 return 0;
350}
351
352static int
353ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
354{
355 if (!strncmp(name, "fix", 4))
356 *type = OCFS2_FILECHECK_TYPE_FIX;
357 else if (!strncmp(name, "check", 6))
358 *type = OCFS2_FILECHECK_TYPE_CHK;
359 else if (!strncmp(name, "set", 4))
360 *type = OCFS2_FILECHECK_TYPE_SET;
361 else
362 return 1;
363
364 return 0;
365}
366
367static int
368ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
369 struct ocfs2_filecheck_args *args)
370{
371 unsigned long val = 0;
372 unsigned int type;
373
374 /* too short/long args length */
375 if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
376 return 1;
377
378 if (ocfs2_filecheck_type_parse(name, &type))
379 return 1;
380 if (ocfs2_filecheck_args_get_long(buf, count, &val))
381 return 1;
382
383 if (val <= 0)
384 return 1;
385
386 args->fa_type = type;
387 if (type == OCFS2_FILECHECK_TYPE_SET)
388 args->fa_len = (unsigned int)val;
389 else
390 args->fa_ino = val;
391
392 return 0;
393}
394
395static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
396 struct kobj_attribute *attr,
397 char *buf)
398{
399
400 ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
401 unsigned int type;
402 struct ocfs2_filecheck_entry *p;
403 struct ocfs2_filecheck_sysfs_entry *ent;
404
405 if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
406 return -EINVAL;
407
408 ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
409 if (!ent) {
410 mlog(ML_ERROR,
411 "Cannot get the corresponding entry via device basename %s\n",
 412 kobj->parent->name);
413 return -ENODEV;
414 }
415
416 if (type == OCFS2_FILECHECK_TYPE_SET) {
417 spin_lock(&ent->fs_fcheck->fc_lock);
418 total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
419 spin_unlock(&ent->fs_fcheck->fc_lock);
420 goto exit;
421 }
422
423 ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
424 total += ret;
425 remain -= ret;
426 spin_lock(&ent->fs_fcheck->fc_lock);
427 list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
428 if (p->fe_type != type)
429 continue;
430
431 ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
432 p->fe_ino, p->fe_done,
433 ocfs2_filecheck_error(p->fe_status));
434 if (ret < 0) {
435 total = ret;
436 break;
437 }
438 if (ret == remain) {
439 /* snprintf() didn't fit */
440 total = -E2BIG;
441 break;
442 }
443 total += ret;
444 remain -= ret;
445 }
446 spin_unlock(&ent->fs_fcheck->fc_lock);
447
448exit:
449 ocfs2_filecheck_sysfs_put(ent);
450 return total;
451}
452
453static int
454ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
455{
456 struct ocfs2_filecheck_entry *p;
457
458 list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
459 if (p->fe_done) {
460 list_del(&p->fe_list);
461 kfree(p);
462 ent->fs_fcheck->fc_size--;
463 ent->fs_fcheck->fc_done--;
464 return 1;
465 }
466 }
467
468 return 0;
469}
470
471static int
472ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
473 unsigned int count)
474{
475 unsigned int i = 0;
476 unsigned int ret = 0;
477
478 while (i++ < count) {
479 if (ocfs2_filecheck_erase_entry(ent))
480 ret++;
481 else
482 break;
483 }
484
485 return (ret == count ? 1 : 0);
486}
487
488static void
489ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
490 struct ocfs2_filecheck_entry *entry)
491{
492 entry->fe_done = 1;
493 spin_lock(&ent->fs_fcheck->fc_lock);
494 ent->fs_fcheck->fc_done++;
495 spin_unlock(&ent->fs_fcheck->fc_lock);
496}
497
498static unsigned int
499ocfs2_filecheck_handle(struct super_block *sb,
500 unsigned long ino, unsigned int flags)
501{
502 unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
503 struct inode *inode = NULL;
504 int rc;
505
506 inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
507 if (IS_ERR(inode)) {
508 rc = (int)(-(long)inode);
509 if (rc >= OCFS2_FILECHECK_ERR_START &&
510 rc < OCFS2_FILECHECK_ERR_END)
511 ret = rc;
512 else
513 ret = OCFS2_FILECHECK_ERR_FAILED;
514 } else
515 iput(inode);
516
517 return ret;
518}
519
520static void
521ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
522 struct ocfs2_filecheck_entry *entry)
523{
524 if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
525 entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
526 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
527 else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
528 entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
529 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
530 else
531 entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
532
533 ocfs2_filecheck_done_entry(ent, entry);
534}
535
536static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
537 struct kobj_attribute *attr,
538 const char *buf, size_t count)
539{
540 struct ocfs2_filecheck_args args;
541 struct ocfs2_filecheck_entry *entry;
542 struct ocfs2_filecheck_sysfs_entry *ent;
543 ssize_t ret = 0;
544
545 if (count == 0)
546 return count;
547
548 if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
549 mlog(ML_ERROR, "Invalid arguments for online file check\n");
550 return -EINVAL;
551 }
552
553 ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
554 if (!ent) {
555 mlog(ML_ERROR,
556 "Cannot get the corresponding entry via device basename %s\n",
557 kobj->parent->name);
558 return -ENODEV;
559 }
560
561 if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
562 ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
563 goto exit;
564 }
565
566 entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS);
567 if (!entry) {
568 ret = -ENOMEM;
569 goto exit;
570 }
571
572 spin_lock(&ent->fs_fcheck->fc_lock);
573 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
574 (ent->fs_fcheck->fc_done == 0)) {
575 mlog(ML_ERROR,
576 "Cannot do more file check "
577 "since file check queue(%u) is full now\n",
578 ent->fs_fcheck->fc_max);
579 ret = -EBUSY;
580 kfree(entry);
581 } else {
582 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
583 (ent->fs_fcheck->fc_done > 0)) {
 584 /* Delete the oldest finished entry to make
 585 * sure the number of entries in the list does
 586 * not exceed the maximum value
587 */
588 BUG_ON(!ocfs2_filecheck_erase_entry(ent));
589 }
590
591 entry->fe_ino = args.fa_ino;
592 entry->fe_type = args.fa_type;
593 entry->fe_done = 0;
594 entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
595 list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
596 ent->fs_fcheck->fc_size++;
597 }
598 spin_unlock(&ent->fs_fcheck->fc_lock);
599
600 if (!ret)
601 ocfs2_filecheck_handle_entry(ent, entry);
602
603exit:
604 ocfs2_filecheck_sysfs_put(ent);
605 return (!ret ? count : ret);
606}
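
ocfs2_filecheck_store() keeps at most fc_max entries per mount: a full queue with nothing finished rejects the request with -EBUSY, otherwise the oldest finished entry is evicted to make room for the new one. The policy in isolation, using a plain array instead of the kernel's spinlock-protected list:

#include <assert.h>
#include <stdbool.h>

#define QMAX 4                  /* stands in for fc_max */

struct entry { unsigned long ino; bool done; bool used; };

static struct entry queue[QMAX];

/* 0 on success, -1 when every slot still holds an unfinished check (EBUSY). */
static int enqueue_check(unsigned long ino)
{
        int i, victim = -1;

        for (i = 0; i < QMAX; i++) {
                if (!queue[i].used) { victim = i; break; }
                if (queue[i].done && victim < 0)
                        victim = i;     /* first finished entry found */
        }
        if (victim < 0)
                return -1;

        queue[victim] = (struct entry){ .ino = ino, .used = true };
        return 0;
}

int main(void)
{
        for (unsigned long ino = 1; ino <= QMAX; ino++)
                assert(enqueue_check(ino) == 0);
        assert(enqueue_check(99) == -1);        /* full, nothing done yet */
        queue[0].done = true;
        assert(enqueue_check(99) == 0);         /* evicts the finished entry */
        return 0;
}
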
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..e5cd002a2c09
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,49 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * filecheck.h
5 *
6 * Online file check.
7 *
8 * Copyright (C) 2016 SuSE. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20
21#ifndef FILECHECK_H
22#define FILECHECK_H
23
24#include <linux/types.h>
25#include <linux/list.h>
26
27
28/* File check errno */
29enum {
30 OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
31 OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
32 OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
33 OCFS2_FILECHECK_ERR_READONLY, /* Read only */
34 OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */
35 OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
36 OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
37 OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
38 OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
39 OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
40 OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
41};
42
43#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
44#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
45
46int ocfs2_filecheck_create_sysfs(struct super_block *sb);
47int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
48
49#endif /* FILECHECK_H */
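
The private error numbers start at 1000 so they can travel through the kernel's negative-errno conventions without colliding with real errnos, and ocfs2_filecheck_error() in filecheck.c maps them back to the string table, with index 0 reserved for "SUCCESS". The same mapping, standalone and with the list shortened:

#include <assert.h>
#include <string.h>

enum {
        FC_ERR_SUCCESS = 0,
        FC_ERR_FAILED = 1000,   /* range start, as in filecheck.h */
        FC_ERR_INPROGRESS,
        FC_ERR_UNSUPPORTED,     /* range end (list shortened here) */
};

static const char * const fc_errs[] = {
        "SUCCESS", "FAILED", "INPROGRESS", "UNSUPPORTED",
};

static const char *fc_error(int err)
{
        if (!err)
                return fc_errs[0];
        assert(err >= FC_ERR_FAILED && err <= FC_ERR_UNSUPPORTED);
        return fc_errs[err - FC_ERR_FAILED + 1];        /* +1 skips SUCCESS */
}

int main(void)
{
        assert(strcmp(fc_error(0), "SUCCESS") == 0);
        assert(strcmp(fc_error(FC_ERR_INPROGRESS), "INPROGRESS") == 0);
        return 0;
}
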
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 36294446d960..12f4a9e9800f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
53#include "xattr.h" 53#include "xattr.h"
54#include "refcounttree.h" 54#include "refcounttree.h"
55#include "ocfs2_trace.h" 55#include "ocfs2_trace.h"
56#include "filecheck.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
74 struct inode *inode, 75 struct inode *inode,
75 struct buffer_head *fe_bh); 76 struct buffer_head *fe_bh);
76 77
78static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
79 struct buffer_head **bh,
80 int flags, int type);
81static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
82 struct buffer_head *bh);
83static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
84 struct buffer_head *bh);
85
77void ocfs2_set_inode_flags(struct inode *inode) 86void ocfs2_set_inode_flags(struct inode *inode)
78{ 87{
79 unsigned int flags = OCFS2_I(inode)->ip_attr; 88 unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 136struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
128 int sysfile_type) 137 int sysfile_type)
129{ 138{
139 int rc = 0;
130 struct inode *inode = NULL; 140 struct inode *inode = NULL;
131 struct super_block *sb = osb->sb; 141 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 142 struct ocfs2_find_inode_args args;
@@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
161 } 171 }
162 trace_ocfs2_iget5_locked(inode->i_state); 172 trace_ocfs2_iget5_locked(inode->i_state);
163 if (inode->i_state & I_NEW) { 173 if (inode->i_state & I_NEW) {
164 ocfs2_read_locked_inode(inode, &args); 174 rc = ocfs2_read_locked_inode(inode, &args);
165 unlock_new_inode(inode); 175 unlock_new_inode(inode);
166 } 176 }
167 if (is_bad_inode(inode)) { 177 if (is_bad_inode(inode)) {
168 iput(inode); 178 iput(inode);
169 inode = ERR_PTR(-ESTALE); 179 if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
180 (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
181 /* Return OCFS2_FILECHECK_ERR_XXX related errno */
182 inode = ERR_PTR(rc);
183 else
184 inode = ERR_PTR(-ESTALE);
170 goto bail; 185 goto bail;
171 } 186 }
172 187
@@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
410 struct ocfs2_super *osb; 425 struct ocfs2_super *osb;
411 struct ocfs2_dinode *fe; 426 struct ocfs2_dinode *fe;
412 struct buffer_head *bh = NULL; 427 struct buffer_head *bh = NULL;
413 int status, can_lock; 428 int status, can_lock, lock_level = 0;
414 u32 generation = 0; 429 u32 generation = 0;
415 430
416 status = -EINVAL; 431 status = -EINVAL;
@@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
478 mlog_errno(status); 493 mlog_errno(status);
479 return status; 494 return status;
480 } 495 }
481 status = ocfs2_inode_lock(inode, NULL, 0); 496 status = ocfs2_inode_lock(inode, NULL, lock_level);
482 if (status) { 497 if (status) {
483 make_bad_inode(inode); 498 make_bad_inode(inode);
484 mlog_errno(status); 499 mlog_errno(status);
@@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
495 } 510 }
496 511
497 if (can_lock) { 512 if (can_lock) {
498 status = ocfs2_read_inode_block_full(inode, &bh, 513 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
499 OCFS2_BH_IGNORE_CACHE); 514 status = ocfs2_filecheck_read_inode_block_full(inode,
515 &bh, OCFS2_BH_IGNORE_CACHE, 0);
516 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
517 status = ocfs2_filecheck_read_inode_block_full(inode,
518 &bh, OCFS2_BH_IGNORE_CACHE, 1);
519 else
520 status = ocfs2_read_inode_block_full(inode,
521 &bh, OCFS2_BH_IGNORE_CACHE);
500 } else { 522 } else {
501 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 523 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
502 /* 524 /*
503 * If buffer is in jbd, then its checksum may not have been 525 * If buffer is in jbd, then its checksum may not have been
504 * computed as yet. 526 * computed as yet.
505 */ 527 */
506 if (!status && !buffer_jbd(bh)) 528 if (!status && !buffer_jbd(bh)) {
507 status = ocfs2_validate_inode_block(osb->sb, bh); 529 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
530 status = ocfs2_filecheck_validate_inode_block(
531 osb->sb, bh);
532 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
533 status = ocfs2_filecheck_repair_inode_block(
534 osb->sb, bh);
535 else
536 status = ocfs2_validate_inode_block(
537 osb->sb, bh);
538 }
508 } 539 }
509 if (status < 0) { 540 if (status < 0) {
510 mlog_errno(status); 541 mlog_errno(status);
@@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode,
532 563
533 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 564 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
534 565
566 if (buffer_dirty(bh) && !buffer_jbd(bh)) {
567 if (can_lock) {
568 ocfs2_inode_unlock(inode, lock_level);
569 lock_level = 1;
570 ocfs2_inode_lock(inode, NULL, lock_level);
571 }
572 status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
573 if (status < 0) {
574 mlog_errno(status);
575 goto bail;
576 }
577 }
578
535 status = 0; 579 status = 0;
536 580
537bail: 581bail:
538 if (can_lock) 582 if (can_lock)
539 ocfs2_inode_unlock(inode, 0); 583 ocfs2_inode_unlock(inode, lock_level);
540 584
541 if (status < 0) 585 if (status < 0)
542 make_bad_inode(inode); 586 make_bad_inode(inode);
@@ -1126,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1126 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
1127 "Clear inode of %llu, inode has io markers\n", 1171 "Clear inode of %llu, inode has io markers\n",
1128 (unsigned long long)oi->ip_blkno); 1172 (unsigned long long)oi->ip_blkno);
1173 mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
1174 "Clear inode of %llu, inode has unwritten extents\n",
1175 (unsigned long long)oi->ip_blkno);
1129 1176
1130 ocfs2_extent_map_trunc(inode, 0); 1177 ocfs2_extent_map_trunc(inode, 0);
1131 1178
@@ -1397,6 +1444,169 @@ bail:
1397 return rc; 1444 return rc;
1398} 1445}
1399 1446
1447static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
1448 struct buffer_head *bh)
1449{
1450 int rc = 0;
1451 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1452
1453 trace_ocfs2_filecheck_validate_inode_block(
1454 (unsigned long long)bh->b_blocknr);
1455
1456 BUG_ON(!buffer_uptodate(bh));
1457
1458 /*
 1459 * Call ocfs2_validate_meta_ecc() first, since it can repair the ecc,
 1460 * but do not return an error immediately when ecc validation
 1461 * fails, because the likely cause is an invalid inode number
 1462 * being passed in.
1463 */
1464 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
1465 if (rc) {
1466 mlog(ML_ERROR,
1467 "Filecheck: checksum failed for dinode %llu\n",
1468 (unsigned long long)bh->b_blocknr);
1469 rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
1470 }
1471
1472 if (!OCFS2_IS_VALID_DINODE(di)) {
1473 mlog(ML_ERROR,
1474 "Filecheck: invalid dinode #%llu: signature = %.*s\n",
1475 (unsigned long long)bh->b_blocknr, 7, di->i_signature);
1476 rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
1477 goto bail;
1478 } else if (rc)
1479 goto bail;
1480
1481 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1482 mlog(ML_ERROR,
1483 "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
1484 (unsigned long long)bh->b_blocknr,
1485 (unsigned long long)le64_to_cpu(di->i_blkno));
1486 rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
1487 goto bail;
1488 }
1489
1490 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1491 mlog(ML_ERROR,
1492 "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
1493 "not set\n",
1494 (unsigned long long)bh->b_blocknr);
1495 rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
1496 goto bail;
1497 }
1498
1499 if (le32_to_cpu(di->i_fs_generation) !=
1500 OCFS2_SB(sb)->fs_generation) {
1501 mlog(ML_ERROR,
1502 "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
1503 (unsigned long long)bh->b_blocknr,
1504 le32_to_cpu(di->i_fs_generation));
1505 rc = -OCFS2_FILECHECK_ERR_GENERATION;
1506 goto bail;
1507 }
1508
1509bail:
1510 return rc;
1511}
1512
1513static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
1514 struct buffer_head *bh)
1515{
1516 int changed = 0;
1517 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1518
1519 if (!ocfs2_filecheck_validate_inode_block(sb, bh))
1520 return 0;
1521
1522 trace_ocfs2_filecheck_repair_inode_block(
1523 (unsigned long long)bh->b_blocknr);
1524
1525 if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
1526 ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
1527 mlog(ML_ERROR,
1528 "Filecheck: cannot repair dinode #%llu "
1529 "on readonly filesystem\n",
1530 (unsigned long long)bh->b_blocknr);
1531 return -OCFS2_FILECHECK_ERR_READONLY;
1532 }
1533
1534 if (buffer_jbd(bh)) {
1535 mlog(ML_ERROR,
1536 "Filecheck: cannot repair dinode #%llu, "
1537 "its buffer is in jbd\n",
1538 (unsigned long long)bh->b_blocknr);
1539 return -OCFS2_FILECHECK_ERR_INJBD;
1540 }
1541
1542 if (!OCFS2_IS_VALID_DINODE(di)) {
1543 /* Cannot fix invalid inode block */
1544 return -OCFS2_FILECHECK_ERR_INVALIDINO;
1545 }
1546
1547 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1548 /* Cannot just add VALID_FL flag back as a fix,
1549 * need more things to check here.
1550 */
1551 return -OCFS2_FILECHECK_ERR_VALIDFLAG;
1552 }
1553
1554 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1555 di->i_blkno = cpu_to_le64(bh->b_blocknr);
1556 changed = 1;
1557 mlog(ML_ERROR,
1558 "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
1559 (unsigned long long)bh->b_blocknr,
1560 (unsigned long long)le64_to_cpu(di->i_blkno));
1561 }
1562
1563 if (le32_to_cpu(di->i_fs_generation) !=
1564 OCFS2_SB(sb)->fs_generation) {
1565 di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1566 changed = 1;
1567 mlog(ML_ERROR,
1568 "Filecheck: reset dinode #%llu: fs_generation to %u\n",
1569 (unsigned long long)bh->b_blocknr,
1570 le32_to_cpu(di->i_fs_generation));
1571 }
1572
1573 if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
1574 ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
1575 mark_buffer_dirty(bh);
1576 mlog(ML_ERROR,
1577 "Filecheck: reset dinode #%llu: compute meta ecc\n",
1578 (unsigned long long)bh->b_blocknr);
1579 }
1580
1581 return 0;
1582}
1583
1584static int
1585ocfs2_filecheck_read_inode_block_full(struct inode *inode,
1586 struct buffer_head **bh,
1587 int flags, int type)
1588{
1589 int rc;
1590 struct buffer_head *tmp = *bh;
1591
1592 if (!type) /* Check inode block */
1593 rc = ocfs2_read_blocks(INODE_CACHE(inode),
1594 OCFS2_I(inode)->ip_blkno,
1595 1, &tmp, flags,
1596 ocfs2_filecheck_validate_inode_block);
1597 else /* Repair inode block */
1598 rc = ocfs2_read_blocks(INODE_CACHE(inode),
1599 OCFS2_I(inode)->ip_blkno,
1600 1, &tmp, flags,
1601 ocfs2_filecheck_repair_inode_block);
1602
1603 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1604 if (!rc && !*bh)
1605 *bh = tmp;
1606
1607 return rc;
1608}
1609
1400int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, 1610int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1401 int flags) 1611 int flags)
1402{ 1612{
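
ocfs2_filecheck_repair_inode_block() only rewrites what it can prove from context: a wrong i_blkno is reset from the buffer's block number, a stale fs_generation from the superblock, and the metadata ECC is recomputed when anything changed; an invalid signature or a cleared OCFS2_VALID_FL is refused rather than guessed at. A sketch of that decide-then-rewrite flow over a trimmed stand-in dinode:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct dinode {                 /* trimmed stand-in for struct ocfs2_dinode */
        bool valid_sig, valid_fl;
        uint64_t blkno;
        uint32_t gen;
};

/* 0 if clean or repaired, -1 if the block is unfixable. */
static int repair_dinode(struct dinode *di, uint64_t real_blkno,
                         uint32_t fs_gen, bool *changed)
{
        *changed = false;
        if (!di->valid_sig || !di->valid_fl)
                return -1;                      /* refuse to guess */
        if (di->blkno != real_blkno) {
                di->blkno = real_blkno;         /* provable from location */
                *changed = true;
        }
        if (di->gen != fs_gen) {
                di->gen = fs_gen;               /* provable from superblock */
                *changed = true;
        }
        /* the caller recomputes the metadata ECC when *changed is set */
        return 0;
}

int main(void)
{
        struct dinode di = { true, true, 5, 7 };
        bool changed;

        assert(repair_dinode(&di, 9, 7, &changed) == 0 && changed);
        assert(di.blkno == 9);
        di.valid_fl = false;
        assert(repair_dinode(&di, 9, 7, &changed) == -1);
        return 0;
}
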
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 struct mutex ip_unaligned_aio;
48
49 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
50 spinlock_t ip_lock; 47 spinlock_t ip_lock;
51 u32 ip_open_count; 48 u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
57 u32 ip_flags; /* see below */ 54 u32 ip_flags; /* see below */
58 u32 ip_attr; /* inode attributes */ 55 u32 ip_attr; /* inode attributes */
59 56
57 /* Record unwritten extents during direct io. */
58 struct list_head ip_unwritten_list;
59
60 /* protected by recovery_lock. */ 60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan; 61 struct inode *ip_next_orphan;
62 62
@@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode);
139/* Flags for ocfs2_iget() */ 139/* Flags for ocfs2_iget() */
140#define OCFS2_FI_FLAG_SYSFILE 0x1 140#define OCFS2_FI_FLAG_SYSFILE 0x1
141#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 141#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
142#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
143#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
144
142struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); 145struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
143struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 146struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
144 int sysfile_type); 147 int sysfile_type);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 61b833b721d8..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
231 /* At this point, we know that no more recovery threads can be 231 /* At this point, we know that no more recovery threads can be
232 * launched, so wait for any recovery completion work to 232 * launched, so wait for any recovery completion work to
233 * complete. */ 233 * complete. */
234 flush_workqueue(ocfs2_wq); 234 flush_workqueue(osb->ocfs2_wq);
235 235
236 /* 236 /*
237 * Now that recovery is shut down, and the osb is about to be 237 * Now that recovery is shut down, and the osb is about to be
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1326 1326
1327 spin_lock(&journal->j_lock); 1327 spin_lock(&journal->j_lock);
1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1329 queue_work(ocfs2_wq, &journal->j_recovery_work); 1329 queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
1330 spin_unlock(&journal->j_lock); 1330 spin_unlock(&journal->j_lock);
1331} 1331}
1332 1332
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
1968 mutex_lock(&os->os_lock); 1968 mutex_lock(&os->os_lock);
1969 ocfs2_queue_orphan_scan(osb); 1969 ocfs2_queue_orphan_scan(osb);
1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1971 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 1971 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
1972 ocfs2_orphan_scan_timeout()); 1972 ocfs2_orphan_scan_timeout());
1973 mutex_unlock(&os->os_lock); 1973 mutex_unlock(&os->os_lock);
1974} 1974}
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
2009 else { 2009 else {
2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
2011 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 2011 queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
2012 ocfs2_orphan_scan_timeout()); 2012 ocfs2_orphan_scan_timeout());
2013 } 2013 }
2014} 2014}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7d62c43a2c3e..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
386 struct ocfs2_dinode *alloc = NULL; 386 struct ocfs2_dinode *alloc = NULL;
387 387
388 cancel_delayed_work(&osb->la_enable_wq); 388 cancel_delayed_work(&osb->la_enable_wq);
389 flush_workqueue(ocfs2_wq); 389 flush_workqueue(osb->ocfs2_wq);
390 390
391 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 391 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
392 goto out; 392 goto out;
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
1085 } else { 1085 } else {
1086 osb->local_alloc_state = OCFS2_LA_DISABLED; 1086 osb->local_alloc_state = OCFS2_LA_DISABLED;
1087 } 1087 }
1088 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, 1088 queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
1089 OCFS2_LA_ENABLE_INTERVAL); 1089 OCFS2_LA_ENABLE_INTERVAL);
1090 goto out_unlock; 1090 goto out_unlock;
1091 } 1091 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 77ebc2bc1cca..9ea081f4e6e4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
104 if (page->index == last_index) 104 if (page->index == last_index)
105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
106 106
107 ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, 107 ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
108 &fsdata, di_bh, page); 108 &locked_page, &fsdata, di_bh, page);
109 if (ret) { 109 if (ret) {
110 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
111 mlog_errno(ret); 111 mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
464 struct ocfs2_refcount_tree *osb_ref_tree_lru; 464 struct ocfs2_refcount_tree *osb_ref_tree_lru;
465 465
466 struct mutex system_file_mutex; 466 struct mutex system_file_mutex;
467
468 /*
469 * OCFS2 needs to schedule several different types of work which
470 * require cluster locking, disk I/O, recovery waits, etc. Since these
471 * types of work tend to be heavy we avoid using the kernel events
472 * workqueue and schedule on our own.
473 */
474 struct workqueue_struct *ocfs2_wq;
467}; 475};
468 476
469#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 477#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
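The hunk above moves the workqueue out of a module-wide global and into struct ocfs2_super, so each mount owns its queue and unmount can flush only its own pending work. A minimal sketch of the resulting call pattern (illustrative only; the patch open-codes queue_work() at every call site rather than adding a helper):

    /* Hypothetical helper, not part of the patch: all deferred work
     * for a mount now funnels through its per-superblock queue. */
    static void ocfs2_queue_osb_work(struct ocfs2_super *osb,
                                     struct work_struct *work)
    {
            queue_work(osb->ocfs2_wq, work);
    }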
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
1450 1450
1451TRACE_EVENT(ocfs2_prepare_inode_for_write, 1451TRACE_EVENT(ocfs2_prepare_inode_for_write,
1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos, 1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
1453 int appending, unsigned long count, 1453 unsigned long count),
1454 int *direct_io, int *has_refcount), 1454 TP_ARGS(ino, saved_pos, count),
1455 TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
1456 TP_STRUCT__entry( 1455 TP_STRUCT__entry(
1457 __field(unsigned long long, ino) 1456 __field(unsigned long long, ino)
1458 __field(unsigned long long, saved_pos) 1457 __field(unsigned long long, saved_pos)
1459 __field(int, appending)
1460 __field(unsigned long, count) 1458 __field(unsigned long, count)
1461 __field(int, direct_io)
1462 __field(int, has_refcount)
1463 ), 1459 ),
1464 TP_fast_assign( 1460 TP_fast_assign(
1465 __entry->ino = ino; 1461 __entry->ino = ino;
1466 __entry->saved_pos = saved_pos; 1462 __entry->saved_pos = saved_pos;
1467 __entry->appending = appending;
1468 __entry->count = count; 1463 __entry->count = count;
1469 __entry->direct_io = direct_io ? *direct_io : -1;
1470 __entry->has_refcount = has_refcount ? *has_refcount : -1;
1471 ), 1464 ),
1472 TP_printk("%llu %llu %d %lu %d %d", __entry->ino, 1465 TP_printk("%llu %llu %lu", __entry->ino,
1473 __entry->saved_pos, __entry->appending, __entry->count, 1466 __entry->saved_pos, __entry->count)
1474 __entry->direct_io, __entry->has_refcount)
1475); 1467);
1476 1468
1477DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); 1469DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
@@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
1540DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); 1532DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
1541 1533
1542DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); 1534DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
1535DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
1536DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
1543 1537
1544TRACE_EVENT(ocfs2_inode_is_valid_to_delete, 1538TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
1545 TP_PROTO(void *task, void *dc_task, unsigned long long ino, 1539 TP_PROTO(void *task, void *dc_task, unsigned long long ino,
@@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
2035 2029
2036DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); 2030DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
2037 2031
2032DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id);
2033
2038DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); 2034DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
2039 2035
2040/* End of trace events for fs/ocfs2/quota_global.c. */ 2036/* End of trace events for fs/ocfs2/quota_global.c. */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 9c9dd30bc945..3892f3c079ca 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
726 dqgrab(dquot); 726 dqgrab(dquot);
727 /* First entry on list -> queue work */ 727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) 728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work); 729 queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
730 goto out; 730 goto out;
731 } 731 }
732 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
@@ -860,6 +860,30 @@ out:
860 return status; 860 return status;
861} 861}
862 862
863static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
864{
865 int type = qid->type;
866 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
867 int status = 0;
868
869 trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
870 status = ocfs2_lock_global_qf(info, 0);
871 if (status < 0)
872 goto out;
873 status = ocfs2_qinfo_lock(info, 0);
874 if (status < 0)
875 goto out_global;
876 status = qtree_get_next_id(&info->dqi_gi, qid);
877 ocfs2_qinfo_unlock(info, 0);
878out_global:
879 ocfs2_unlock_global_qf(info, 0);
880out:
881 /* Avoid logging ENOENT since it just means there is no next ID */
882 if (status && status != -ENOENT)
883 mlog_errno(status);
884 return status;
885}
886
863static int ocfs2_mark_dquot_dirty(struct dquot *dquot) 887static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
864{ 888{
865 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | 889 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
@@ -968,4 +992,5 @@ const struct dquot_operations ocfs2_quota_operations = {
968 .write_info = ocfs2_write_info, 992 .write_info = ocfs2_write_info,
969 .alloc_dquot = ocfs2_alloc_dquot, 993 .alloc_dquot = ocfs2_alloc_dquot,
970 .destroy_dquot = ocfs2_destroy_dquot, 994 .destroy_dquot = ocfs2_destroy_dquot,
995 .get_next_id = ocfs2_get_next_id,
971}; 996};
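With ->get_next_id wired into ocfs2_quota_operations, the generic quota layer can now answer Q_GETNEXTQUOTA on ocfs2. A hedged userspace sketch of what that enables (the helper and its error handling are illustrative, not part of the patch):

    #include <stdio.h>
    #include <sys/quota.h>
    #include <linux/quota.h>

    /* Walk the allocated user-quota IDs on an ocfs2 volume; each
     * query is routed through ocfs2_get_next_id() above. */
    static void walk_user_quotas(const char *blkdev)
    {
            struct if_nextdqblk nd;
            unsigned int id = 0;

            while (quotactl(QCMD(Q_GETNEXTQUOTA, USRQUOTA), blkdev,
                            id, (void *)&nd) == 0) {
                    printf("quota set for id %u\n", nd.dqb_id);
                    id = nd.dqb_id + 1;
            }
            /* loop terminates with ENOENT once no higher ID exists */
    }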
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 576b9a04873f..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { 196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i); 197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
199 if (cluster > clusters) 199 if (cluster >= clusters)
200 break; 200 break;
201 201
202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); 202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
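The relaxed comparison is an off-by-one fix: clusters is the new total cluster count, so valid cluster indices run 0 through clusters - 1, and a backup superblock whose cluster index equals clusters lies past the end of the resized volume. A worked instance with made-up numbers:

    /* Illustrative values only: a new size of 1000 clusters means
     * valid indices 0..999, so a backup super at cluster 1000 is
     * out of range and the loop must stop before touching it. */
    u32 clusters = 1000;
    u32 cluster = 1000;

    if (cluster >= clusters)        /* the old ">" test missed this case */
            break;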
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
629 .attrs = ocfs2_attrs, 629 .attrs = ocfs2_attrs,
630}; 630};
631 631
632static struct kset *ocfs2_kset; 632struct kset *ocfs2_kset;
633EXPORT_SYMBOL_GPL(ocfs2_kset);
633 634
634static void ocfs2_sysfs_exit(void) 635static void ocfs2_sysfs_exit(void)
635{ 636{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
298int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); 298int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
299void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); 299void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
300 300
301extern struct kset *ocfs2_kset;
302
301#endif /* STACKGLUE_H */ 303#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index faa1365097bc..7db631e1c8b0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,17 +74,12 @@
74#include "suballoc.h" 74#include "suballoc.h"
75 75
76#include "buffer_head_io.h" 76#include "buffer_head_io.h"
77#include "filecheck.h"
77 78
78static struct kmem_cache *ocfs2_inode_cachep; 79static struct kmem_cache *ocfs2_inode_cachep;
79struct kmem_cache *ocfs2_dquot_cachep; 80struct kmem_cache *ocfs2_dquot_cachep;
80struct kmem_cache *ocfs2_qf_chunk_cachep; 81struct kmem_cache *ocfs2_qf_chunk_cachep;
81 82
82/* OCFS2 needs to schedule several different types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */
86struct workqueue_struct *ocfs2_wq = NULL;
87
88static struct dentry *ocfs2_debugfs_root; 83static struct dentry *ocfs2_debugfs_root;
89 84
90MODULE_AUTHOR("Oracle"); 85MODULE_AUTHOR("Oracle");
@@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
236 struct ocfs2_recovery_map *rm = osb->recovery_map; 231 struct ocfs2_recovery_map *rm = osb->recovery_map;
237 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; 232 struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
238 int i, out = 0; 233 int i, out = 0;
234 unsigned long flags;
239 235
240 out += snprintf(buf + out, len - out, 236 out += snprintf(buf + out, len - out,
241 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", 237 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
271 cconn->cc_version.pv_minor); 267 cconn->cc_version.pv_minor);
272 } 268 }
273 269
274 spin_lock(&osb->dc_task_lock); 270 spin_lock_irqsave(&osb->dc_task_lock, flags);
275 out += snprintf(buf + out, len - out, 271 out += snprintf(buf + out, len - out,
276 "%10s => Pid: %d Count: %lu WakeSeq: %lu " 272 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
277 "WorkSeq: %lu\n", "DownCnvt", 273 "WorkSeq: %lu\n", "DownCnvt",
278 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), 274 (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
279 osb->blocked_lock_count, osb->dc_wake_sequence, 275 osb->blocked_lock_count, osb->dc_wake_sequence,
280 osb->dc_work_sequence); 276 osb->dc_work_sequence);
281 spin_unlock(&osb->dc_task_lock); 277 spin_unlock_irqrestore(&osb->dc_task_lock, flags);
282 278
283 spin_lock(&osb->osb_lock); 279 spin_lock(&osb->osb_lock);
284 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", 280 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
@@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1204 /* Start this when the mount is almost sure of being successful */ 1200 /* Start this when the mount is almost sure of being successful */
1205 ocfs2_orphan_scan_start(osb); 1201 ocfs2_orphan_scan_start(osb);
1206 1202
1203 /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
1204 ocfs2_filecheck_create_sysfs(sb);
1205
1207 return status; 1206 return status;
1208 1207
1209read_super_error: 1208read_super_error:
@@ -1608,33 +1607,25 @@ static int __init ocfs2_init(void)
1608 if (status < 0) 1607 if (status < 0)
1609 goto out2; 1608 goto out2;
1610 1609
1611 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
1612 if (!ocfs2_wq) {
1613 status = -ENOMEM;
1614 goto out3;
1615 }
1616
1617 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1610 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1618 if (!ocfs2_debugfs_root) { 1611 if (!ocfs2_debugfs_root) {
1619 status = -ENOMEM; 1612 status = -ENOMEM;
1620 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1613 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1621 goto out4; 1614 goto out3;
1622 } 1615 }
1623 1616
1624 ocfs2_set_locking_protocol(); 1617 ocfs2_set_locking_protocol();
1625 1618
1626 status = register_quota_format(&ocfs2_quota_format); 1619 status = register_quota_format(&ocfs2_quota_format);
1627 if (status < 0) 1620 if (status < 0)
1628 goto out4; 1621 goto out3;
1629 status = register_filesystem(&ocfs2_fs_type); 1622 status = register_filesystem(&ocfs2_fs_type);
1630 if (!status) 1623 if (!status)
1631 return 0; 1624 return 0;
1632 1625
1633 unregister_quota_format(&ocfs2_quota_format); 1626 unregister_quota_format(&ocfs2_quota_format);
1634out4:
1635 destroy_workqueue(ocfs2_wq);
1636 debugfs_remove(ocfs2_debugfs_root);
1637out3: 1627out3:
1628 debugfs_remove(ocfs2_debugfs_root);
1638 ocfs2_free_mem_caches(); 1629 ocfs2_free_mem_caches();
1639out2: 1630out2:
1640 exit_ocfs2_uptodate_cache(); 1631 exit_ocfs2_uptodate_cache();
@@ -1645,11 +1636,6 @@ out1:
1645 1636
1646static void __exit ocfs2_exit(void) 1637static void __exit ocfs2_exit(void)
1647{ 1638{
1648 if (ocfs2_wq) {
1649 flush_workqueue(ocfs2_wq);
1650 destroy_workqueue(ocfs2_wq);
1651 }
1652
1653 unregister_quota_format(&ocfs2_quota_format); 1639 unregister_quota_format(&ocfs2_quota_format);
1654 1640
1655 debugfs_remove(ocfs2_debugfs_root); 1641 debugfs_remove(ocfs2_debugfs_root);
@@ -1667,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb)
1667 1653
1668 ocfs2_sync_blockdev(sb); 1654 ocfs2_sync_blockdev(sb);
1669 ocfs2_dismount_volume(sb, 0); 1655 ocfs2_dismount_volume(sb, 0);
1656 ocfs2_filecheck_remove_sysfs(sb);
1670} 1657}
1671 1658
1672static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1659static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1739,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
1739 spin_lock_init(&oi->ip_lock); 1726 spin_lock_init(&oi->ip_lock);
1740 ocfs2_extent_map_init(&oi->vfs_inode); 1727 ocfs2_extent_map_init(&oi->vfs_inode);
1741 INIT_LIST_HEAD(&oi->ip_io_markers); 1728 INIT_LIST_HEAD(&oi->ip_io_markers);
1729 INIT_LIST_HEAD(&oi->ip_unwritten_list);
1742 oi->ip_dir_start_lookup = 0; 1730 oi->ip_dir_start_lookup = 0;
1743 mutex_init(&oi->ip_unaligned_aio);
1744 init_rwsem(&oi->ip_alloc_sem); 1731 init_rwsem(&oi->ip_alloc_sem);
1745 init_rwsem(&oi->ip_xattr_sem); 1732 init_rwsem(&oi->ip_xattr_sem);
1746 mutex_init(&oi->ip_io_mutex); 1733 mutex_init(&oi->ip_io_mutex);
@@ -2343,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2343 } 2330 }
2344 cleancache_init_shared_fs(sb); 2331 cleancache_init_shared_fs(sb);
2345 2332
2333 osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
2334 if (!osb->ocfs2_wq) {
2335 status = -ENOMEM;
2336 mlog_errno(status);
2337 }
2338
2346bail: 2339bail:
2347 return status; 2340 return status;
2348} 2341}
@@ -2530,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2530{ 2523{
2531 /* This function assumes that the caller has the main osb resource */ 2524 /* This function assumes that the caller has the main osb resource */
2532 2525
2526 /* ocfs2_initialize_super() has already created this workqueue */
2527 if (osb->ocfs2_wq) {
2528 flush_workqueue(osb->ocfs2_wq);
2529 destroy_workqueue(osb->ocfs2_wq);
2530 }
2531
2533 ocfs2_free_slot_info(osb); 2532 ocfs2_free_slot_info(osb);
2534 2533
2535 kfree(osb->osb_orphan_wipes); 2534 kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
26#ifndef OCFS2_SUPER_H 26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H 27#define OCFS2_SUPER_H
28 28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 29int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num); 30 int node_num);
33 31
diff --git a/fs/open.c b/fs/open.c
index 55bdc75e2172..17cb6b1dab75 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
992EXPORT_SYMBOL(filp_open); 992EXPORT_SYMBOL(filp_open);
993 993
994struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, 994struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
995 const char *filename, int flags) 995 const char *filename, int flags, umode_t mode)
996{ 996{
997 struct open_flags op; 997 struct open_flags op;
998 int err = build_open_flags(flags, 0, &op); 998 int err = build_open_flags(flags, mode, &op);
999 if (err) 999 if (err)
1000 return ERR_PTR(err); 1000 return ERR_PTR(err);
1001 if (flags & O_CREAT)
1002 return ERR_PTR(-EINVAL);
1003 return do_file_open_root(dentry, mnt, filename, &op); 1001 return do_file_open_root(dentry, mnt, filename, &op);
1004} 1002}
1005EXPORT_SYMBOL(file_open_root); 1003EXPORT_SYMBOL(file_open_root);
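The signature change lets in-kernel callers create files through file_open_root(): the unconditional -EINVAL on O_CREAT is gone and the new mode argument now feeds build_open_flags(). A hedged example of the updated call (the dentry/mount variables, path, and mode are illustrative):

    struct file *filp;

    filp = file_open_root(root_dentry, root_mnt,
                          "trace/output.log", O_WRONLY | O_CREAT, 0600);
    if (IS_ERR(filp))
            return PTR_ERR(filp);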
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig
new file mode 100644
index 000000000000..1554c02489de
--- /dev/null
+++ b/fs/orangefs/Kconfig
@@ -0,0 +1,6 @@
1config ORANGEFS_FS
2 tristate "ORANGEFS (Powered by PVFS) support"
3 select FS_POSIX_ACL
4 help
5 Orange is a parallel file system designed for use on high end
6 computing (HEC) systems.
diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile
new file mode 100644
index 000000000000..a9d6a968fe6d
--- /dev/null
+++ b/fs/orangefs/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for the ORANGEFS filesystem.
3#
4
5obj-$(CONFIG_ORANGEFS_FS) += orangefs.o
6
7orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \
8 dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \
9 devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \
10 orangefs-debugfs.o waitqueue.o
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
new file mode 100644
index 000000000000..03f89dbb2512
--- /dev/null
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,175 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10#include <linux/posix_acl_xattr.h>
11#include <linux/fs_struct.h>
12
13struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
14{
15 struct posix_acl *acl;
16 int ret;
17 char *key = NULL, *value = NULL;
18
19 switch (type) {
20 case ACL_TYPE_ACCESS:
21 key = ORANGEFS_XATTR_NAME_ACL_ACCESS;
22 break;
23 case ACL_TYPE_DEFAULT:
24 key = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
25 break;
26 default:
27 gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
28 return ERR_PTR(-EINVAL);
29 }
30 /*
31 * Rather than incurring a network call just to determine the exact
32 * length of the attribute, I just allocate a max length to save on
33 * the network call. Conceivably, we could pass NULL to
34 * orangefs_inode_getxattr() to probe the length of the value, but
35 * I don't do that for now.
36 */
37 value = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
38 if (value == NULL)
39 return ERR_PTR(-ENOMEM);
40
41 gossip_debug(GOSSIP_ACL_DEBUG,
42 "inode %pU, key %s, type %d\n",
43 get_khandle_from_ino(inode),
44 key,
45 type);
46 ret = orangefs_inode_getxattr(inode,
47 "",
48 key,
49 value,
50 ORANGEFS_MAX_XATTR_VALUELEN);
51 /* if the key exists, convert it to an in-memory rep */
52 if (ret > 0) {
53 acl = posix_acl_from_xattr(&init_user_ns, value, ret);
54 } else if (ret == -ENODATA || ret == -ENOSYS) {
55 acl = NULL;
56 } else {
57 gossip_err("inode %pU retrieving acl's failed with error %d\n",
58 get_khandle_from_ino(inode),
59 ret);
60 acl = ERR_PTR(ret);
61 }
62 /* kfree(NULL) is safe, so don't worry if value ever got used */
63 kfree(value);
64 return acl;
65}
66
67int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
68{
69 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
70 int error = 0;
71 void *value = NULL;
72 size_t size = 0;
73 const char *name = NULL;
74
75 switch (type) {
76 case ACL_TYPE_ACCESS:
77 name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
78 if (acl) {
79 umode_t mode = inode->i_mode;
80 /*
81 * can we represent this with the traditional file
82 * mode permission bits?
83 */
84 error = posix_acl_equiv_mode(acl, &mode);
85 if (error < 0) {
86 gossip_err("%s: posix_acl_equiv_mode err: %d\n",
87 __func__,
88 error);
89 return error;
90 }
91
92 if (inode->i_mode != mode)
93 SetModeFlag(orangefs_inode);
94 inode->i_mode = mode;
95 mark_inode_dirty_sync(inode);
96 if (error == 0)
97 acl = NULL;
98 }
99 break;
100 case ACL_TYPE_DEFAULT:
101 name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
102 break;
103 default:
104 gossip_err("%s: invalid type %d!\n", __func__, type);
105 return -EINVAL;
106 }
107
108 gossip_debug(GOSSIP_ACL_DEBUG,
109 "%s: inode %pU, key %s type %d\n",
110 __func__, get_khandle_from_ino(inode),
111 name,
112 type);
113
114 if (acl) {
115 size = posix_acl_xattr_size(acl->a_count);
116 value = kmalloc(size, GFP_KERNEL);
117 if (!value)
118 return -ENOMEM;
119
120 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
121 if (error < 0)
122 goto out;
123 }
124
125 gossip_debug(GOSSIP_ACL_DEBUG,
126 "%s: name %s, value %p, size %zd, acl %p\n",
127 __func__, name, value, size, acl);
128 /*
129 * Go ahead and set the extended attribute now. NOTE: if acl is
130 * NULL, then value will be NULL and size will be 0, and that
131 * will translate to a removexattr. However, we don't want
132 * removexattr to complain if the attribute does not exist.
133 */
134 error = orangefs_inode_setxattr(inode, "", name, value, size, 0);
135
136out:
137 kfree(value);
138 if (!error)
139 set_cached_acl(inode, type, acl);
140 return error;
141}
142
143int orangefs_init_acl(struct inode *inode, struct inode *dir)
144{
145 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
146 struct posix_acl *default_acl, *acl;
147 umode_t mode = inode->i_mode;
148 int error = 0;
149
150 ClearModeFlag(orangefs_inode);
151
152 error = posix_acl_create(dir, &mode, &default_acl, &acl);
153 if (error)
154 return error;
155
156 if (default_acl) {
157 error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
158 posix_acl_release(default_acl);
159 }
160
161 if (acl) {
162 if (!error)
163 error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
164 posix_acl_release(acl);
165 }
166
167 /* If the mode of the inode was changed, then do a forcible ->setattr */
168 if (mode != inode->i_mode) {
169 SetModeFlag(orangefs_inode);
170 inode->i_mode = mode;
171 orangefs_flush_inode(inode);
172 }
173
174 return error;
175}
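The ACL_TYPE_ACCESS branch in orangefs_set_acl() above leans on the three-way return of posix_acl_equiv_mode(); annotated for reference (this restates the kernel API contract, not new patch behavior):

    error = posix_acl_equiv_mode(acl, &mode);
    if (error < 0)
            return error;   /* malformed ACL */
    if (error == 0)
            acl = NULL;     /* fully representable as mode bits, so no
                             * xattr is stored; a NULL value later maps
                             * to a removexattr */
    /* error > 0: extended entries remain, keep the ACL xattr */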
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
new file mode 100644
index 000000000000..5dfc4f3cfe68
--- /dev/null
+++ b/fs/orangefs/dcache.c
@@ -0,0 +1,138 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Implementation of dentry (directory cache) functions.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13
14/* Returns 1 if dentry can still be trusted, else 0. */
15static int orangefs_revalidate_lookup(struct dentry *dentry)
16{
17 struct dentry *parent_dentry = dget_parent(dentry);
18 struct inode *parent_inode = parent_dentry->d_inode;
19 struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
20 struct inode *inode = dentry->d_inode;
21 struct orangefs_kernel_op_s *new_op;
22 int ret = 0;
23 int err = 0;
24
25 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
26
27 new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
28 if (!new_op)
29 goto out_put_parent;
30
31 new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
32 new_op->upcall.req.lookup.parent_refn = parent->refn;
33 strncpy(new_op->upcall.req.lookup.d_name,
34 dentry->d_name.name,
35 ORANGEFS_NAME_MAX);
36
37 gossip_debug(GOSSIP_DCACHE_DEBUG,
38 "%s:%s:%d interrupt flag [%d]\n",
39 __FILE__,
40 __func__,
41 __LINE__,
42 get_interruptible_flag(parent_inode));
43
44 err = service_operation(new_op, "orangefs_lookup",
45 get_interruptible_flag(parent_inode));
46
47 /* Positive dentry: reject if error or not the same inode. */
48 if (inode) {
49 if (err) {
50 gossip_debug(GOSSIP_DCACHE_DEBUG,
51 "%s:%s:%d lookup failure.\n",
52 __FILE__, __func__, __LINE__);
53 goto out_drop;
54 }
55 if (!match_handle(new_op->downcall.resp.lookup.refn.khandle,
56 inode)) {
57 gossip_debug(GOSSIP_DCACHE_DEBUG,
58 "%s:%s:%d no match.\n",
59 __FILE__, __func__, __LINE__);
60 goto out_drop;
61 }
62
63 /* Negative dentry: reject if success or error other than ENOENT. */
64 } else {
65 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
66 __func__);
67 if (!err || err != -ENOENT) {
68 if (new_op->downcall.status != 0)
69 gossip_debug(GOSSIP_DCACHE_DEBUG,
70 "%s:%s:%d lookup failure.\n",
71 __FILE__, __func__, __LINE__);
72 goto out_drop;
73 }
74 }
75
76 ret = 1;
77out_release_op:
78 op_release(new_op);
79out_put_parent:
80 dput(parent_dentry);
81 return ret;
82out_drop:
83 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
84 __FILE__, __func__, __LINE__);
85 goto out_release_op;
86}
87
88/*
89 * Verify that dentry is valid.
90 *
91 * Should return 1 if dentry can still be trusted, else 0.
92 */
93static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
94{
95 int ret;
96
97 if (flags & LOOKUP_RCU)
98 return -ECHILD;
99
100 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
101 __func__, dentry);
102
103 /* skip root handle lookups. */
104 if (dentry->d_inode && is_root_handle(dentry->d_inode))
105 return 1;
106
107 /*
108 * If this passes, the positive dentry still exists or the negative
109 * dentry still does not exist.
110 */
111 if (!orangefs_revalidate_lookup(dentry))
112 return 0;
113
114 /* We do not need to continue with negative dentries. */
115 if (!dentry->d_inode)
116 goto out;
117
118 /* Now we must perform a getattr to validate the inode contents. */
119
120 ret = orangefs_inode_check_changed(dentry->d_inode);
121 if (ret < 0) {
122 gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d getattr failure.\n",
123 __FILE__, __func__, __LINE__);
124 return 0;
125 }
126 if (ret == 0)
127 return 0;
128
129out:
130 gossip_debug(GOSSIP_DCACHE_DEBUG,
131 "%s: negative dentry or positive dentry and inode valid.\n",
132 __func__);
133 return 1;
134}
135
136const struct dentry_operations orangefs_dentry_operations = {
137 .d_revalidate = orangefs_d_revalidate,
138};
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
new file mode 100644
index 000000000000..db170beba797
--- /dev/null
+++ b/fs/orangefs/devorangefs-req.c
@@ -0,0 +1,943 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * Changes by Acxiom Corporation to add protocol version to kernel
5 * communication, Copyright Acxiom Corporation, 2005.
6 *
7 * See COPYING in top-level directory.
8 */
9
10#include "protocol.h"
11#include "orangefs-kernel.h"
12#include "orangefs-dev-proto.h"
13#include "orangefs-bufmap.h"
14
15#include <linux/debugfs.h>
16#include <linux/slab.h>
17
18/* this file implements the /dev/pvfs2-req device node */
19
20static int open_access_count;
21
22#define DUMP_DEVICE_ERROR() \
23do { \
24 gossip_err("*****************************************************\n");\
25 gossip_err("ORANGEFS Device Error: You cannot open the device file "); \
26 gossip_err("\n/dev/%s more than once. Please make sure that\nthere " \
27 "are no ", ORANGEFS_REQDEVICE_NAME); \
28 gossip_err("instances of a program using this device\ncurrently " \
29 "running. (You must verify this!)\n"); \
30 gossip_err("For example, you can use the lsof program as follows:\n");\
31 gossip_err("'lsof | grep %s' (run this as root)\n", \
32 ORANGEFS_REQDEVICE_NAME); \
33 gossip_err(" open_access_count = %d\n", open_access_count); \
34 gossip_err("*****************************************************\n");\
35} while (0)
36
37static int hash_func(__u64 tag, int table_size)
38{
39 return do_div(tag, (unsigned int)table_size);
40}
41
42static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
43{
44 int index = hash_func(op->tag, hash_table_size);
45
46 list_add_tail(&op->list, &htable_ops_in_progress[index]);
47}
48
49/*
50 * find the op with this tag and remove it from the in progress
51 * hash table.
52 */
53static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
54{
55 struct orangefs_kernel_op_s *op, *next;
56 int index;
57
58 index = hash_func(tag, hash_table_size);
59
60 spin_lock(&htable_ops_in_progress_lock);
61 list_for_each_entry_safe(op,
62 next,
63 &htable_ops_in_progress[index],
64 list) {
65 if (op->tag == tag && !op_state_purged(op) &&
66 !op_state_given_up(op)) {
67 list_del_init(&op->list);
68 spin_unlock(&htable_ops_in_progress_lock);
69 return op;
70 }
71 }
72
73 spin_unlock(&htable_ops_in_progress_lock);
74 return NULL;
75}
76
77/* Returns whether any FS are still pending remounted */
78static int mark_all_pending_mounts(void)
79{
80 int unmounted = 1;
81 struct orangefs_sb_info_s *orangefs_sb = NULL;
82
83 spin_lock(&orangefs_superblocks_lock);
84 list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
85 /* All of these file systems require a remount */
86 orangefs_sb->mount_pending = 1;
87 unmounted = 0;
88 }
89 spin_unlock(&orangefs_superblocks_lock);
90 return unmounted;
91}
92
93/*
94 * Determine if a given file system needs to be remounted or not
95 * Returns -1 on error
96 * 0 if already mounted
97 * 1 if needs remount
98 */
99static int fs_mount_pending(__s32 fsid)
100{
101 int mount_pending = -1;
102 struct orangefs_sb_info_s *orangefs_sb = NULL;
103
104 spin_lock(&orangefs_superblocks_lock);
105 list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
106 if (orangefs_sb->fs_id == fsid) {
107 mount_pending = orangefs_sb->mount_pending;
108 break;
109 }
110 }
111 spin_unlock(&orangefs_superblocks_lock);
112 return mount_pending;
113}
114
115static int orangefs_devreq_open(struct inode *inode, struct file *file)
116{
117 int ret = -EINVAL;
118
119 if (!(file->f_flags & O_NONBLOCK)) {
120 gossip_err("%s: device cannot be opened in blocking mode\n",
121 __func__);
122 goto out;
123 }
124 ret = -EACCES;
125 gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n");
126 mutex_lock(&devreq_mutex);
127
128 if (open_access_count == 0) {
129 open_access_count = 1;
130 ret = 0;
131 } else {
132 DUMP_DEVICE_ERROR();
133 }
134 mutex_unlock(&devreq_mutex);
135
136out:
137
138 gossip_debug(GOSSIP_DEV_DEBUG,
139 "pvfs2-client-core: open device complete (ret = %d)\n",
140 ret);
141 return ret;
142}
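Taken together, the open path admits exactly one opener, and only in non-blocking mode; that opener is the client-core daemon. A hypothetical client-side open (the literal device path follows the /dev/pvfs2-req node named at the top of this file):

    #include <fcntl.h>
    #include <stdio.h>

    /* The driver rejects blocking opens outright and returns -EACCES
     * to any second opener. */
    int devfd = open("/dev/pvfs2-req", O_RDWR | O_NONBLOCK);
    if (devfd < 0)
            perror("open /dev/pvfs2-req");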
143
144/* Function for read() callers into the device */
145static ssize_t orangefs_devreq_read(struct file *file,
146 char __user *buf,
147 size_t count, loff_t *offset)
148{
149 struct orangefs_kernel_op_s *op, *temp;
150 __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
151 static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
152 struct orangefs_kernel_op_s *cur_op = NULL;
153 unsigned long ret;
154
155 /* We do not support blocking IO. */
156 if (!(file->f_flags & O_NONBLOCK)) {
157 gossip_err("%s: blocking read from client-core.\n",
158 __func__);
159 return -EINVAL;
160 }
161
162 /*
163 * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then
164 * always read with that size buffer.
165 */
166 if (count != MAX_DEV_REQ_UPSIZE) {
167 gossip_err("orangefs: client-core tried to read wrong size\n");
168 return -EINVAL;
169 }
170
171restart:
172 /* Get next op (if any) from top of list. */
173 spin_lock(&orangefs_request_list_lock);
174 list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
175 __s32 fsid;
176 /* This lock is held past the end of the loop when we break. */
177 spin_lock(&op->lock);
178 if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
179 spin_unlock(&op->lock);
180 continue;
181 }
182
183 fsid = fsid_of_op(op);
184 if (fsid != ORANGEFS_FS_ID_NULL) {
185 int ret;
186 /* Skip ops whose filesystem needs to be mounted. */
187 ret = fs_mount_pending(fsid);
188 if (ret == 1) {
189 gossip_debug(GOSSIP_DEV_DEBUG,
190 "%s: mount pending, skipping op tag "
191 "%llu %s\n",
192 __func__,
193 llu(op->tag),
194 get_opname_string(op));
195 spin_unlock(&op->lock);
196 continue;
197 /*
198 * Skip ops whose filesystem we don't know about unless
199 * it is being mounted.
200 */
201 /* XXX: is there a better way to detect this? */
202 } else if (ret == -1 &&
203 !(op->upcall.type ==
204 ORANGEFS_VFS_OP_FS_MOUNT ||
205 op->upcall.type ==
206 ORANGEFS_VFS_OP_GETATTR)) {
207 gossip_debug(GOSSIP_DEV_DEBUG,
208 "orangefs: skipping op tag %llu %s\n",
209 llu(op->tag), get_opname_string(op));
210 gossip_err(
211 "orangefs: ERROR: fs_mount_pending %d\n",
212 fsid);
213 spin_unlock(&op->lock);
214 continue;
215 }
216 }
217 /*
218 * Either this op does not pertain to a filesystem, is mounting
219 * a filesystem, or pertains to a mounted filesystem. Let it
220 * through.
221 */
222 cur_op = op;
223 break;
224 }
225
226 /*
227 * At this point we either have a valid op and can continue or have not
228 * found an op and must ask the client to try again later.
229 */
230 if (!cur_op) {
231 spin_unlock(&orangefs_request_list_lock);
232 return -EAGAIN;
233 }
234
235 gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n",
236 __func__,
237 llu(cur_op->tag),
238 get_opname_string(cur_op));
239
240 /*
241 * Such an op should never be on the list in the first place. If so, we
242 * will abort.
243 */
244 if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
245 gossip_err("orangefs: ERROR: Current op already queued.\n");
246 list_del_init(&cur_op->list);
247 spin_unlock(&cur_op->lock);
248 spin_unlock(&orangefs_request_list_lock);
249 return -EAGAIN;
250 }
251
252 list_del_init(&cur_op->list);
253 spin_unlock(&orangefs_request_list_lock);
254
255 spin_unlock(&cur_op->lock);
256
257 /* Push the upcall out. */
258 ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
259 if (ret != 0)
260 goto error;
261 ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
262 if (ret != 0)
263 goto error;
264 ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
265 if (ret != 0)
266 goto error;
267 ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
268 sizeof(struct orangefs_upcall_s));
269 if (ret != 0)
270 goto error;
271
272 spin_lock(&htable_ops_in_progress_lock);
273 spin_lock(&cur_op->lock);
274 if (unlikely(op_state_given_up(cur_op))) {
275 spin_unlock(&cur_op->lock);
276 spin_unlock(&htable_ops_in_progress_lock);
277 complete(&cur_op->waitq);
278 goto restart;
279 }
280
281 /*
282 * Set the operation to be in progress and move it between lists since
283 * it has been sent to the client.
284 */
285 set_op_state_inprogress(cur_op);
286 gossip_debug(GOSSIP_DEV_DEBUG,
287 "%s: 1 op:%s: op_state:%d: process:%s:\n",
288 __func__,
289 get_opname_string(cur_op),
290 cur_op->op_state,
291 current->comm);
292 orangefs_devreq_add_op(cur_op);
293 spin_unlock(&cur_op->lock);
294 spin_unlock(&htable_ops_in_progress_lock);
295
296 /* The client only asks to read one size buffer. */
297 return MAX_DEV_REQ_UPSIZE;
298error:
299 /*
300 * We were unable to copy the op data to the client. Put the op back in
301 * list. If client has crashed, the op will be purged later when the
302 * device is released.
303 */
304 gossip_err("orangefs: Failed to copy data to user space\n");
305 spin_lock(&orangefs_request_list_lock);
306 spin_lock(&cur_op->lock);
307 if (likely(!op_state_given_up(cur_op))) {
308 set_op_state_waiting(cur_op);
309 gossip_debug(GOSSIP_DEV_DEBUG,
310 "%s: 2 op:%s: op_state:%d: process:%s:\n",
311 __func__,
312 get_opname_string(cur_op),
313 cur_op->op_state,
314 current->comm);
315 list_add(&cur_op->list, &orangefs_request_list);
316 spin_unlock(&cur_op->lock);
317 } else {
318 spin_unlock(&cur_op->lock);
319 complete(&cur_op->waitq);
320 }
321 spin_unlock(&orangefs_request_list_lock);
322 return -EFAULT;
323}
324
325/*
326 * Function for writev() callers into the device.
327 *
328 * Userspace should have written:
329 * - __u32 version
330 * - __u32 magic
331 * - __u64 tag
332 * - struct orangefs_downcall_s
333 * - trailer buffer (in the case of READDIR operations)
334 */
335static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
336 struct iov_iter *iter)
337{
338 ssize_t ret;
339 struct orangefs_kernel_op_s *op = NULL;
340 struct {
341 __u32 version;
342 __u32 magic;
343 __u64 tag;
344 } head;
345 int total = ret = iov_iter_count(iter);
346 int n;
347 int downcall_size = sizeof(struct orangefs_downcall_s);
348 int head_size = sizeof(head);
349
350 gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n",
351 __func__,
352 total,
353 ret);
354
355 if (total < MAX_DEV_REQ_DOWNSIZE) {
356 gossip_err("%s: total:%d: must be at least:%u:\n",
357 __func__,
358 total,
359 (unsigned int) MAX_DEV_REQ_DOWNSIZE);
360 return -EFAULT;
361 }
362
363 n = copy_from_iter(&head, head_size, iter);
364 if (n < head_size) {
365 gossip_err("%s: failed to copy head.\n", __func__);
366 return -EFAULT;
367 }
368
369 if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
370 gossip_err("%s: userspace claims version"
371 "%d, minimum version required: %d.\n",
372 __func__,
373 head.version,
374 ORANGEFS_MINIMUM_USERSPACE_VERSION);
375 return -EPROTO;
376 }
377
378 if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
379 gossip_err("Error: Device magic number does not match.\n");
380 return -EPROTO;
381 }
382
383 /* remove the op from the in progress hash table */
384 op = orangefs_devreq_remove_op(head.tag);
385 if (!op) {
386 gossip_err("WARNING: No one's waiting for tag %llu\n",
387 llu(head.tag));
388 return ret;
389 }
390
391 n = copy_from_iter(&op->downcall, downcall_size, iter);
392 if (n != downcall_size) {
393 gossip_err("%s: failed to copy downcall.\n", __func__);
394 goto Efault;
395 }
396
397 if (op->downcall.status)
398 goto wakeup;
399
400 /*
401 * We've successfully peeled off the head and the downcall.
402 * Something has gone awry if total doesn't equal the
403 * sum of head_size, downcall_size and trailer_size.
404 */
405 if ((head_size + downcall_size + op->downcall.trailer_size) != total) {
406 gossip_err("%s: funky write, head_size:%d"
407 ": downcall_size:%d: trailer_size:%lld"
408 ": total size:%d:\n",
409 __func__,
410 head_size,
411 downcall_size,
412 op->downcall.trailer_size,
413 total);
414 goto Efault;
415 }
416
417 /* Only READDIR operations should have trailers. */
418 if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) &&
419 (op->downcall.trailer_size != 0)) {
420 gossip_err("%s: %x operation with trailer.",
421 __func__,
422 op->downcall.type);
423 goto Efault;
424 }
425
426 /* READDIR operations should always have trailers. */
427 if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) &&
428 (op->downcall.trailer_size == 0)) {
429 gossip_err("%s: %x operation with no trailer.",
430 __func__,
431 op->downcall.type);
432 goto Efault;
433 }
434
435 if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
436 goto wakeup;
437
438 op->downcall.trailer_buf =
439 vmalloc(op->downcall.trailer_size);
440 if (op->downcall.trailer_buf == NULL) {
441 gossip_err("%s: failed trailer vmalloc.\n",
442 __func__);
443 goto Enomem;
444 }
445 memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
446 n = copy_from_iter(op->downcall.trailer_buf,
447 op->downcall.trailer_size,
448 iter);
449 if (n != op->downcall.trailer_size) {
450 gossip_err("%s: failed to copy trailer.\n", __func__);
451 vfree(op->downcall.trailer_buf);
452 goto Efault;
453 }
454
455wakeup:
456 /*
457 * Return to vfs waitqueue, and back to service_operation
458 * through wait_for_matching_downcall.
459 */
460 spin_lock(&op->lock);
461 if (unlikely(op_is_cancel(op))) {
462 spin_unlock(&op->lock);
463 put_cancel(op);
464 } else if (unlikely(op_state_given_up(op))) {
465 spin_unlock(&op->lock);
466 complete(&op->waitq);
467 } else {
468 set_op_state_serviced(op);
469 gossip_debug(GOSSIP_DEV_DEBUG,
470 "%s: op:%s: op_state:%d: process:%s:\n",
471 __func__,
472 get_opname_string(op),
473 op->op_state,
474 current->comm);
475 spin_unlock(&op->lock);
476 }
477 return ret;
478
479Efault:
480 op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
481 ret = -EFAULT;
482 goto wakeup;
483
484Enomem:
485 op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
486 ret = -ENOMEM;
487 goto wakeup;
488}
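For reference, the byte stream this function peels apart is the header described in the comment above, then the downcall, then an optional trailer. A sketch of the frame as the client-core would lay it out (packing and naming are assumptions; the kernel reads the head and downcall with separate copy_from_iter() calls, so no such struct need exist in userspace):

    struct downcall_frame {
            __u32 version;  /* >= ORANGEFS_MINIMUM_USERSPACE_VERSION */
            __u32 magic;    /* ORANGEFS_DEVREQ_MAGIC                 */
            __u64 tag;      /* ties the reply to an in-progress op   */
            struct orangefs_downcall_s downcall;
            /* READDIR only: downcall.trailer_size trailer bytes follow */
    };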
489
490/*
491 * NOTE: gets called when the last reference to this device is dropped.
492 * Using the open_access_count variable, we enforce a reference count
493 * on this file so that it can be opened by only one process at a time.
494 * The devreq_mutex is used to make sure all I/O has completed
495 * before we call orangefs_bufmap_finalize, and to cover similar
496 * tricky situations.
497 */
498static int orangefs_devreq_release(struct inode *inode, struct file *file)
499{
500 int unmounted = 0;
501
502 gossip_debug(GOSSIP_DEV_DEBUG,
503 "%s:pvfs2-client-core: exiting, closing device\n",
504 __func__);
505
506 mutex_lock(&devreq_mutex);
507 orangefs_bufmap_finalize();
508
509 open_access_count = -1;
510
511 unmounted = mark_all_pending_mounts();
512 gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n",
513 (unmounted ? "UNMOUNTED" : "MOUNTED"));
514
515 purge_waiting_ops();
516 purge_inprogress_ops();
517
518 orangefs_bufmap_run_down();
519
520 gossip_debug(GOSSIP_DEV_DEBUG,
521 "pvfs2-client-core: device close complete\n");
522 open_access_count = 0;
523 mutex_unlock(&devreq_mutex);
524 return 0;
525}
526
527int is_daemon_in_service(void)
528{
529 int in_service;
530
531 /*
532 * This function checks whether the client-core is alive, based
533 * on the access count we maintain on the device.
534 */
535 mutex_lock(&devreq_mutex);
536 in_service = open_access_count == 1 ? 0 : -EIO;
537 mutex_unlock(&devreq_mutex);
538 return in_service;
539}
540
541bool __is_daemon_in_service(void)
542{
543 return open_access_count == 1;
544}
545
546static inline long check_ioctl_command(unsigned int command)
547{
548 /* Check for valid ioctl codes */
549 if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) {
550 gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n",
551 command,
552 _IOC_TYPE(command),
553 ORANGEFS_DEV_MAGIC);
554 return -EINVAL;
555 }
556 /* and valid ioctl commands */
557 if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) {
558 gossip_err("Invalid ioctl command number [%d >= %d]\n",
559 _IOC_NR(command), ORANGEFS_DEV_MAXNR);
560 return -ENOIOCTLCMD;
561 }
562 return 0;
563}
564
565static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
566{
567 static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
568 static __s32 max_up_size = MAX_DEV_REQ_UPSIZE;
569 static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
570 struct ORANGEFS_dev_map_desc user_desc;
571 int ret = 0;
572 struct dev_mask_info_s mask_info = { 0 };
573 struct dev_mask2_info_s mask2_info = { 0, 0 };
574 int upstream_kmod = 1;
575 struct orangefs_sb_info_s *orangefs_sb;
576
577 /* mtmoore: add locking here */
578
579 switch (command) {
580 case ORANGEFS_DEV_GET_MAGIC:
581 return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ?
582 -EIO :
583 0);
584 case ORANGEFS_DEV_GET_MAX_UPSIZE:
585 return ((put_user(max_up_size,
586 (__s32 __user *) arg) == -EFAULT) ?
587 -EIO :
588 0);
589 case ORANGEFS_DEV_GET_MAX_DOWNSIZE:
590 return ((put_user(max_down_size,
591 (__s32 __user *) arg) == -EFAULT) ?
592 -EIO :
593 0);
594 case ORANGEFS_DEV_MAP:
595 ret = copy_from_user(&user_desc,
596 (struct ORANGEFS_dev_map_desc __user *)
597 arg,
598 sizeof(struct ORANGEFS_dev_map_desc));
599 /* WTF -EIO and not -EFAULT? */
600 return ret ? -EIO : orangefs_bufmap_initialize(&user_desc);
601 case ORANGEFS_DEV_REMOUNT_ALL:
602 gossip_debug(GOSSIP_DEV_DEBUG,
603 "%s: got ORANGEFS_DEV_REMOUNT_ALL\n",
604 __func__);
605
606 /*
607 * remount all mounted orangefs volumes to regain the lost
608 * dynamic mount tables (if any) -- NOTE: this is done
609 * without keeping the superblock list locked due to the
610 * upcall/downcall waiting. also, the request mutex is
611 * used to ensure that no operations will be serviced until
612 * all of the remounts are serviced (to avoid ops between
613 * mounts to fail)
614 */
615 ret = mutex_lock_interruptible(&request_mutex);
616 if (ret < 0)
617 return ret;
618 gossip_debug(GOSSIP_DEV_DEBUG,
619 "%s: priority remount in progress\n",
620 __func__);
621 spin_lock(&orangefs_superblocks_lock);
622 list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
623 /*
624 * We have to drop the spinlock, so entries can be
625 * removed. They can't be freed, though, so we just
626 * keep the forward pointers and zero the back ones -
627 * that way we can get to the rest of the list.
628 */
629 if (!orangefs_sb->list.prev)
630 continue;
631 gossip_debug(GOSSIP_DEV_DEBUG,
632 "%s: Remounting SB %p\n",
633 __func__,
634 orangefs_sb);
635
636 spin_unlock(&orangefs_superblocks_lock);
637 ret = orangefs_remount(orangefs_sb);
638 spin_lock(&orangefs_superblocks_lock);
639 if (ret) {
640 gossip_debug(GOSSIP_DEV_DEBUG,
641 "SB %p remount failed\n",
642 orangefs_sb);
643 break;
644 }
645 }
646 spin_unlock(&orangefs_superblocks_lock);
647 gossip_debug(GOSSIP_DEV_DEBUG,
648 "%s: priority remount complete\n",
649 __func__);
650 mutex_unlock(&request_mutex);
651 return ret;
652
653 case ORANGEFS_DEV_UPSTREAM:
654 ret = copy_to_user((void __user *)arg,
655 &upstream_kmod,
656 sizeof(upstream_kmod));
657
658 if (ret != 0)
659 return -EIO;
660 else
661 return ret;
662
663 case ORANGEFS_DEV_CLIENT_MASK:
664 ret = copy_from_user(&mask2_info,
665 (void __user *)arg,
666 sizeof(struct dev_mask2_info_s));
667
668 if (ret != 0)
669 return -EIO;
670
671 client_debug_mask.mask1 = mask2_info.mask1_value;
672 client_debug_mask.mask2 = mask2_info.mask2_value;
673
674 pr_info("%s: client debug mask has been received "
675 ":%llx: :%llx:\n",
676 __func__,
677 (unsigned long long)client_debug_mask.mask1,
678 (unsigned long long)client_debug_mask.mask2);
679
680 return ret;
681
682 case ORANGEFS_DEV_CLIENT_STRING:
683 ret = copy_from_user(&client_debug_array_string,
684 (void __user *)arg,
685 ORANGEFS_MAX_DEBUG_STRING_LEN);
686 /*
687 * The real client-core makes an effort to ensure that
688 * the strings it sends are short enough to fit in this
689 * buffer. We're going to use string functions on the
690 * data we got, so we make this extra effort to keep
691 * from running off the end of the buffer when we use
692 * those string functions, even if the data we ended up
693 * with here is somehow garbage.
694 *
695 */
696 client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
697 '\0';
698
699 if (ret != 0) {
700 pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
701 __func__);
702 return -EIO;
703 }
704
705 pr_info("%s: client debug array string has been received.\n",
706 __func__);
707
708 if (!help_string_initialized) {
709
710 /* Free the "we don't know yet" default string... */
711 kfree(debug_help_string);
712
713 /* build a proper debug help string */
714 if (orangefs_prepare_debugfs_help_string(0)) {
715 gossip_err("%s: no debug help string\n",
716 __func__);
717 return -EIO;
718 }
719
720 /* Replace the boilerplate boot-time debug-help file. */
721 debugfs_remove(help_file_dentry);
722
723 help_file_dentry =
724 debugfs_create_file(
725 ORANGEFS_KMOD_DEBUG_HELP_FILE,
726 0444,
727 debug_dir,
728 debug_help_string,
729 &debug_help_fops);
730
731 if (!help_file_dentry) {
732 gossip_err("%s: debugfs_create_file failed for"
733 " :%s:!\n",
734 __func__,
735 ORANGEFS_KMOD_DEBUG_HELP_FILE);
736 return -EIO;
737 }
738 }
739
740 debug_mask_to_string(&client_debug_mask, 1);
741
742 debugfs_remove(client_debug_dentry);
743
744 orangefs_client_debug_init();
745
746 help_string_initialized++;
747
748 return ret;
749
750 case ORANGEFS_DEV_DEBUG:
751 ret = copy_from_user(&mask_info,
752 (void __user *)arg,
753 sizeof(mask_info));
754
755 if (ret != 0)
756 return -EIO;
757
758 if (mask_info.mask_type == KERNEL_MASK) {
759 if ((mask_info.mask_value == 0)
760 && (kernel_mask_set_mod_init)) {
761 /*
762 * the kernel debug mask was set when the
763 * kernel module was loaded; don't override
764 * it if the client-core was started without
765 * a value for ORANGEFS_KMODMASK.
766 */
767 return 0;
768 }
769 debug_mask_to_string(&mask_info.mask_value,
770 mask_info.mask_type);
771 gossip_debug_mask = mask_info.mask_value;
772 pr_info("%s: kernel debug mask has been modified to "
773 ":%s: :%llx:\n",
774 __func__,
775 kernel_debug_string,
776 (unsigned long long)gossip_debug_mask);
777 } else if (mask_info.mask_type == CLIENT_MASK) {
778 debug_mask_to_string(&mask_info.mask_value,
779 mask_info.mask_type);
780 pr_info("%s: client debug mask has been modified to "
781 ":%s: :%llx:\n",
782 __func__,
783 client_debug_string,
784 llu(mask_info.mask_value));
785 } else {
786 gossip_lerr("Invalid mask type....\n");
787 return -EINVAL;
788 }
789
790 return ret;
791
792 default:
793 return -ENOIOCTLCMD;
794 }
795 return -ENOIOCTLCMD;
796}
797
798static long orangefs_devreq_ioctl(struct file *file,
799 unsigned int command, unsigned long arg)
800{
801 long ret;
802
803 /* Check for properly constructed commands */
804 ret = check_ioctl_command(command);
805 if (ret < 0)
806 return (int)ret;
807
808 return (int)dispatch_ioctl_command(command, arg);
809}
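A plausible client-core startup handshake against these ioctls (hedged: the ordering and error handling are assumptions, but each GET command takes a pointer to an __s32, as the put_user() calls above show):

    __s32 magic, up_size, down_size;

    if (ioctl(devfd, ORANGEFS_DEV_GET_MAGIC, &magic) < 0 ||
        ioctl(devfd, ORANGEFS_DEV_GET_MAX_UPSIZE, &up_size) < 0 ||
        ioctl(devfd, ORANGEFS_DEV_GET_MAX_DOWNSIZE, &down_size) < 0)
            return -1;
    /* every subsequent read() must pass a buffer of exactly up_size */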
810
811#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
812
813/* Compat structure for the ORANGEFS_DEV_MAP ioctl */
814struct ORANGEFS_dev_map_desc32 {
815 compat_uptr_t ptr;
816 __s32 total_size;
817 __s32 size;
818 __s32 count;
819};
820
821static unsigned long translate_dev_map26(unsigned long args, long *error)
822{
823 struct ORANGEFS_dev_map_desc32 __user *p32 = (void __user *)args;
824 /*
825 * Depending on the architecture, allocate some space on the
826 * user-call-stack based on our expected layout.
827 */
828 struct ORANGEFS_dev_map_desc __user *p =
829 compat_alloc_user_space(sizeof(*p));
830 compat_uptr_t addr;
831
832 *error = 0;
833 /* get the ptr from the 32 bit user-space */
834 if (get_user(addr, &p32->ptr))
835 goto err;
836 /* try to put that into a 64-bit layout */
837 if (put_user(compat_ptr(addr), &p->ptr))
838 goto err;
839 /* copy the remaining fields */
840 if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32)))
841 goto err;
842 if (copy_in_user(&p->size, &p32->size, sizeof(__s32)))
843 goto err;
844 if (copy_in_user(&p->count, &p32->count, sizeof(__s32)))
845 goto err;
846 return (unsigned long)p;
847err:
848 *error = -EFAULT;
849 return 0;
850}
851
852/*
853 * ioctl handler for 32-bit user-space apps when the kernel
854 * module is compiled as 64-bit
855 */
856static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
857 unsigned long args)
858{
859 long ret;
860 unsigned long arg = args;
861
862 /* Check for properly constructed commands */
863 ret = check_ioctl_command(cmd);
864 if (ret < 0)
865 return ret;
866 if (cmd == ORANGEFS_DEV_MAP) {
867 /*
868 * convert the arguments to what we expect internally
869 * in kernel space
870 */
871 arg = translate_dev_map26(args, &ret);
872 if (ret < 0) {
873 gossip_err("Could not translate dev map\n");
874 return ret;
875 }
876 }
877 /* no other ioctl requires translation */
878 return dispatch_ioctl_command(cmd, arg);
879}
880
881#endif /* CONFIG_COMPAT is in .config */
882
883/* the assigned character device major number */
884static int orangefs_dev_major;
885
886/*
887 * Initialize orangefs device specific state:
888 * Must be called at module load time only
889 */
890int orangefs_dev_init(void)
891{
892 /* register orangefs-req device */
893 orangefs_dev_major = register_chrdev(0,
894 ORANGEFS_REQDEVICE_NAME,
895 &orangefs_devreq_file_operations);
896 if (orangefs_dev_major < 0) {
897 gossip_debug(GOSSIP_DEV_DEBUG,
898 "Failed to register /dev/%s (error %d)\n",
899 ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
900 return orangefs_dev_major;
901 }
902
903 gossip_debug(GOSSIP_DEV_DEBUG,
904 "*** /dev/%s character device registered ***\n",
905 ORANGEFS_REQDEVICE_NAME);
906 gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
907 ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
908 return 0;
909}
910
911void orangefs_dev_cleanup(void)
912{
913 unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME);
914 gossip_debug(GOSSIP_DEV_DEBUG,
915 "*** /dev/%s character device unregistered ***\n",
916 ORANGEFS_REQDEVICE_NAME);
917}
918
919static unsigned int orangefs_devreq_poll(struct file *file,
920 struct poll_table_struct *poll_table)
921{
922 int poll_revent_mask = 0;
923
924 poll_wait(file, &orangefs_request_list_waitq, poll_table);
925
926 if (!list_empty(&orangefs_request_list))
927 poll_revent_mask |= POLL_IN;
928 return poll_revent_mask;
929}
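The poll hook lets the client-core sleep until an op is queued rather than spinning on -EAGAIN from read(). A hypothetical event loop on the userspace side:

    #include <poll.h>
    #include <unistd.h>

    struct pollfd pfd = { .fd = devfd, .events = POLLIN };

    for (;;) {
            if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
                    /* one op per read; the size must be exactly
                     * MAX_DEV_REQ_UPSIZE or the driver returns -EINVAL */
                    read(devfd, buf, MAX_DEV_REQ_UPSIZE);
            }
    }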
930
931const struct file_operations orangefs_devreq_file_operations = {
932 .owner = THIS_MODULE,
933 .read = orangefs_devreq_read,
934 .write_iter = orangefs_devreq_write_iter,
935 .open = orangefs_devreq_open,
936 .release = orangefs_devreq_release,
937 .unlocked_ioctl = orangefs_devreq_ioctl,
938
939#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
940 .compat_ioctl = orangefs_devreq_compat_ioctl,
941#endif
942 .poll = orangefs_devreq_poll
943};
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
new file mode 100644
index 000000000000..f30b6ecacdd1
--- /dev/null
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,400 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10
11/*
12 * decode routine used by kmod to deal with the blob sent from
13 * userspace for readdirs. The blob contains zero or more of these
14 * sub-blobs:
15 * __u32 - represents length of the character string that follows.
16 * string - between 1 and ORANGEFS_NAME_MAX bytes long.
17 * padding - (if needed) to cause the __u32 plus the string to be
18 * eight byte aligned.
19 * khandle - sizeof(khandle) bytes.
20 */
21static long decode_dirents(char *ptr, size_t size,
22 struct orangefs_readdir_response_s *readdir)
23{
24 int i;
25 struct orangefs_readdir_response_s *rd =
26 (struct orangefs_readdir_response_s *) ptr;
27 char *buf = ptr;
28 int khandle_size = sizeof(struct orangefs_khandle);
29 size_t offset = offsetof(struct orangefs_readdir_response_s,
30 dirent_array);
31 /* 8 reflects eight byte alignment */
32 int smallest_blob = khandle_size + 8;
33 __u32 len;
34 int aligned_len;
35 int sizeof_u32 = sizeof(__u32);
36 long ret;
37
38 gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
39
40 /* size == offset on empty dirs, size > offset on non-empty dirs... */
41 if (size < offset) {
42 gossip_err("%s: size:%zu: offset:%zu:\n",
43 __func__,
44 size,
45 offset);
46 ret = -EINVAL;
47 goto out;
48 }
49
50 if ((size == offset) && (rd->orangefs_dirent_outcount != 0)) {
51 gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
52 __func__,
53 size,
54 rd->orangefs_dirent_outcount);
55 ret = -EINVAL;
56 goto out;
57 }
58
59 readdir->token = rd->token;
60 readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
61 readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
62 sizeof(*readdir->dirent_array),
63 GFP_KERNEL);
64 if (readdir->dirent_array == NULL) {
65 gossip_err("%s: kcalloc failed.\n", __func__);
66 ret = -ENOMEM;
67 goto out;
68 }
69
70 buf += offset;
71 size -= offset;
72
73 for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
74 if (size < smallest_blob) {
75 gossip_err("%s: size:%zu: smallest_blob:%d:\n",
76 __func__,
77 size,
78 smallest_blob);
79 ret = -EINVAL;
80 goto free;
81 }
82
83 len = *(__u32 *)buf;
84 if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
85 gossip_err("%s: len:%d:\n", __func__, len);
86 ret = -EINVAL;
87 goto free;
88 }
89
90 gossip_debug(GOSSIP_DIR_DEBUG,
91 "%s: size:%zu: len:%d:\n",
92 __func__,
93 size,
94 len);
95
96 readdir->dirent_array[i].d_name = buf + sizeof_u32;
97 readdir->dirent_array[i].d_length = len;
98
99 /*
100 * Calculate "aligned" length of this string and its
101 * associated __u32 descriptor.
102 */
103 aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
104 gossip_debug(GOSSIP_DIR_DEBUG,
105 "%s: aligned_len:%d:\n",
106 __func__,
107 aligned_len);
108
109 /*
110 * The end of the blob should coincide with the end
111 * of the last sub-blob.
112 */
113 if (size < aligned_len + khandle_size) {
114 gossip_err("%s: ran off the end of the blob.\n",
115 __func__);
116 ret = -EINVAL;
117 goto free;
118 }
119 size -= aligned_len + khandle_size;
120
121 buf += aligned_len;
122
123 readdir->dirent_array[i].khandle =
124 *(struct orangefs_khandle *) buf;
125 buf += khandle_size;
126 }
127 ret = buf - ptr;
128 gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
129 goto out;
130
131free:
132 kfree(readdir->dirent_array);
133 readdir->dirent_array = NULL;
134
135out:
136 return ret;
137}
138
139/*
140 * Read directory entries from an instance of an open directory.
141 */
142static int orangefs_readdir(struct file *file, struct dir_context *ctx)
143{
144 int ret = 0;
145 int buffer_index;
146 /*
147 * ptoken supports Orangefs' distributed directory logic, added
148 * in 2.9.2.
149 */
150 __u64 *ptoken = file->private_data;
151 __u64 pos = 0;
152 ino_t ino = 0;
153 struct dentry *dentry = file->f_path.dentry;
154 struct orangefs_kernel_op_s *new_op = NULL;
155 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
156 int buffer_full = 0;
157 struct orangefs_readdir_response_s readdir_response;
158 void *dents_buf;
159 int i = 0;
160 int len = 0;
161 ino_t current_ino = 0;
162 char *current_entry = NULL;
163 long bytes_decoded;
164
165 gossip_debug(GOSSIP_DIR_DEBUG,
166 "%s: ctx->pos:%lld, ptoken = %llu\n",
167 __func__,
168 lld(ctx->pos),
169 llu(*ptoken));
170
171 pos = (__u64) ctx->pos;
172
173 /* are we done? */
174 if (pos == ORANGEFS_READDIR_END) {
175 gossip_debug(GOSSIP_DIR_DEBUG,
176 "Skipping to termination path\n");
177 return 0;
178 }
179
180 gossip_debug(GOSSIP_DIR_DEBUG,
181 "orangefs_readdir called on %s (pos=%llu)\n",
182 dentry->d_name.name, llu(pos));
183
184 memset(&readdir_response, 0, sizeof(readdir_response));
185
186 new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
187 if (!new_op)
188 return -ENOMEM;
189
190 /*
191 * Only the indices are shared. No memory is actually shared, but the
192 * mechanism is used.
193 */
194 new_op->uses_shared_memory = 1;
195 new_op->upcall.req.readdir.refn = orangefs_inode->refn;
196 new_op->upcall.req.readdir.max_dirent_count =
197 ORANGEFS_MAX_DIRENT_COUNT_READDIR;
198
199 gossip_debug(GOSSIP_DIR_DEBUG,
200 "%s: upcall.req.readdir.refn.khandle: %pU\n",
201 __func__,
202 &new_op->upcall.req.readdir.refn.khandle);
203
204 new_op->upcall.req.readdir.token = *ptoken;
205
206get_new_buffer_index:
207 buffer_index = orangefs_readdir_index_get();
208 if (buffer_index < 0) {
209 ret = buffer_index;
210 gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
211 ret);
212 goto out_free_op;
213 }
214 new_op->upcall.req.readdir.buf_index = buffer_index;
215
216 ret = service_operation(new_op,
217 "orangefs_readdir",
218 get_interruptible_flag(dentry->d_inode));
219
220 gossip_debug(GOSSIP_DIR_DEBUG,
221 "Readdir downcall status is %d. ret:%d\n",
222 new_op->downcall.status,
223 ret);
224
225 orangefs_readdir_index_put(buffer_index);
226
227 if (ret == -EAGAIN && op_state_purged(new_op)) {
228 /* Client-core indices are invalid after it restarted. */
229 gossip_debug(GOSSIP_DIR_DEBUG,
230 "%s: Getting new buffer_index for retry of readdir..\n",
231 __func__);
232 goto get_new_buffer_index;
233 }
234
235 if (ret == -EIO && op_state_purged(new_op)) {
236 gossip_err("%s: Client is down. Aborting readdir call.\n",
237 __func__);
238 goto out_free_op;
239 }
240
241 if (ret < 0 || new_op->downcall.status != 0) {
242 gossip_debug(GOSSIP_DIR_DEBUG,
243 "Readdir request failed. Status:%d\n",
244 new_op->downcall.status);
245 if (ret >= 0)
246 ret = new_op->downcall.status;
247 goto out_free_op;
248 }
249
250 dents_buf = new_op->downcall.trailer_buf;
251 if (dents_buf == NULL) {
252 gossip_err("Invalid NULL buffer in readdir response\n");
253 ret = -ENOMEM;
254 goto out_free_op;
255 }
256
257 bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
258 &readdir_response);
259 if (bytes_decoded < 0) {
260 ret = bytes_decoded;
261 gossip_err("Could not decode readdir from buffer %d\n", ret);
262 goto out_vfree;
263 }
264
265 if (bytes_decoded != new_op->downcall.trailer_size) {
266 gossip_err("orangefs_readdir: # bytes decoded (%ld) "
267 "!= trailer size (%ld)\n",
268 bytes_decoded,
269 (long)new_op->downcall.trailer_size);
270 ret = -EINVAL;
271 goto out_destroy_handle;
272 }
273
274 /*
275 * orangefs doesn't actually store dot and dot-dot, but
276 * we need to have them represented.
277 */
278 if (pos == 0) {
279 ino = get_ino_from_khandle(dentry->d_inode);
280 gossip_debug(GOSSIP_DIR_DEBUG,
281 "%s: calling dir_emit of \".\" with pos = %llu\n",
282 __func__,
283 llu(pos));
284 ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
285 pos += 1;
286 }
287
288 if (pos == 1) {
289 ino = get_parent_ino_from_dentry(dentry);
290 gossip_debug(GOSSIP_DIR_DEBUG,
291 "%s: calling dir_emit of \"..\" with pos = %llu\n",
292 __func__,
293 llu(pos));
294 ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
295 pos += 1;
296 }
297
298 /*
299 * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
300 * to prevent "finding" dot and dot-dot on any iteration
301 * other than the first.
302 */
303 if (ctx->pos == ORANGEFS_ITERATE_NEXT)
304 ctx->pos = 0;
305
306 gossip_debug(GOSSIP_DIR_DEBUG,
307 "%s: dirent_outcount:%d:\n",
308 __func__,
309 readdir_response.orangefs_dirent_outcount);
310 for (i = ctx->pos;
311 i < readdir_response.orangefs_dirent_outcount;
312 i++) {
313 len = readdir_response.dirent_array[i].d_length;
314 current_entry = readdir_response.dirent_array[i].d_name;
315 current_ino = orangefs_khandle_to_ino(
316 &readdir_response.dirent_array[i].khandle);
317
318 gossip_debug(GOSSIP_DIR_DEBUG,
319 "calling dir_emit for %s with len %d"
320 ", ctx->pos %ld\n",
321 current_entry,
322 len,
323 (unsigned long)ctx->pos);
324 /*
325 * type is unknown. We don't return object type
326 * in the dirent_array. This leaves getdents
327 * clueless about type.
328 */
329 ret =
330 dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
331 if (!ret)
332 break;
333 ctx->pos++;
334 gossip_debug(GOSSIP_DIR_DEBUG,
335 "%s: ctx->pos:%lld\n",
336 __func__,
337 lld(ctx->pos));
338
339 }
340
341 /*
342 * we ran all the way through the last batch, set up for
343 * getting another batch...
344 */
345 if (ret) {
346 *ptoken = readdir_response.token;
347 ctx->pos = ORANGEFS_ITERATE_NEXT;
348 }
349
350 /*
351 * Did we hit the end of the directory?
352 */
353 if (readdir_response.token == ORANGEFS_READDIR_END &&
354 !buffer_full) {
355 gossip_debug(GOSSIP_DIR_DEBUG,
356 "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
357 ctx->pos = ORANGEFS_READDIR_END;
358 }
359
360out_destroy_handle:
361 /* kfree(NULL) is safe */
362 kfree(readdir_response.dirent_array);
363out_vfree:
364 gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
365 vfree(dents_buf);
368out_free_op:
369 op_release(new_op);
370 gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
371 return ret;
372}
373
374static int orangefs_dir_open(struct inode *inode, struct file *file)
375{
376 __u64 *ptoken;
377
378 file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
379 if (!file->private_data)
380 return -ENOMEM;
381
382 ptoken = file->private_data;
383 *ptoken = ORANGEFS_READDIR_START;
384 return 0;
385}
386
387static int orangefs_dir_release(struct inode *inode, struct file *file)
388{
389 orangefs_flush_inode(inode);
390 kfree(file->private_data);
391 return 0;
392}
393
394/** ORANGEFS implementation of VFS directory operations */
395const struct file_operations orangefs_dir_operations = {
396 .read = generic_read_dir,
397 .iterate = orangefs_readdir,
398 .open = orangefs_dir_open,
399 .release = orangefs_dir_release,
400};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
new file mode 100644
index 000000000000..66b99210f1f9
--- /dev/null
+++ b/fs/orangefs/downcall.h
@@ -0,0 +1,133 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Definitions of downcalls used in Linux kernel module.
9 */
10
11#ifndef __DOWNCALL_H
12#define __DOWNCALL_H
13
14/*
15 * Sanitized the device/client-core interaction
16 * for clean 32/64-bit usage.
17 */
18struct orangefs_io_response {
19 __s64 amt_complete;
20};
21
22struct orangefs_lookup_response {
23 struct orangefs_object_kref refn;
24};
25
26struct orangefs_create_response {
27 struct orangefs_object_kref refn;
28};
29
30struct orangefs_symlink_response {
31 struct orangefs_object_kref refn;
32};
33
34struct orangefs_getattr_response {
35 struct ORANGEFS_sys_attr_s attributes;
36 char link_target[ORANGEFS_NAME_MAX];
37};
38
39struct orangefs_mkdir_response {
40 struct orangefs_object_kref refn;
41};
42
43/*
44 * duplication of some system interface structures so that I don't have
45 * to allocate extra memory
46 */
47struct orangefs_dirent {
48 char *d_name;
49 int d_length;
50 struct orangefs_khandle khandle;
51};
52
53struct orangefs_statfs_response {
54 __s64 block_size;
55 __s64 blocks_total;
56 __s64 blocks_avail;
57 __s64 files_total;
58 __s64 files_avail;
59};
60
61struct orangefs_fs_mount_response {
62 __s32 fs_id;
63 __s32 id;
64 struct orangefs_khandle root_khandle;
65};
66
67/* the getxattr response is the attribute value */
68struct orangefs_getxattr_response {
69 __s32 val_sz;
70 __s32 __pad1;
71 char val[ORANGEFS_MAX_XATTR_VALUELEN];
72};
73
74/* the listxattr response is an array of attribute names */
75struct orangefs_listxattr_response {
76 __s32 returned_count;
77 __s32 __pad1;
78 __u64 token;
79 char key[ORANGEFS_MAX_XATTR_LISTLEN * ORANGEFS_MAX_XATTR_NAMELEN];
80 __s32 keylen;
81 __s32 __pad2;
82 __s32 lengths[ORANGEFS_MAX_XATTR_LISTLEN];
83};
84
85struct orangefs_param_response {
86 __s64 value;
87};
88
89#define PERF_COUNT_BUF_SIZE 4096
90struct orangefs_perf_count_response {
91 char buffer[PERF_COUNT_BUF_SIZE];
92};
93
94#define FS_KEY_BUF_SIZE 4096
95struct orangefs_fs_key_response {
96 __s32 fs_keylen;
97 __s32 __pad1;
98 char fs_key[FS_KEY_BUF_SIZE];
99};
100
101struct orangefs_downcall_s {
102 __s32 type;
103 __s32 status;
104 /* currently trailer is used only by readdir */
105 __s64 trailer_size;
106 char *trailer_buf;
107
108 union {
109 struct orangefs_io_response io;
110 struct orangefs_lookup_response lookup;
111 struct orangefs_create_response create;
112 struct orangefs_symlink_response sym;
113 struct orangefs_getattr_response getattr;
114 struct orangefs_mkdir_response mkdir;
115 struct orangefs_statfs_response statfs;
116 struct orangefs_fs_mount_response fs_mount;
117 struct orangefs_getxattr_response getxattr;
118 struct orangefs_listxattr_response listxattr;
119 struct orangefs_param_response param;
120 struct orangefs_perf_count_response perf_count;
121 struct orangefs_fs_key_response fs_key;
122 } resp;
123};
124
125struct orangefs_readdir_response_s {
126 __u64 token;
127 __u64 directory_version;
128 __u32 __pad2;
129 __u32 orangefs_dirent_outcount;
130 struct orangefs_dirent *dirent_array;
131};
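
/*
 * Note: decode_dirents() in dir.c fills dirent_array from the readdir
 * trailer blob; each d_name points into the trailer buffer itself
 * rather than into a separate allocation, so the trailer must stay
 * alive for as long as the array is in use.
 */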
132
133#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
new file mode 100644
index 000000000000..ae92795ed965
--- /dev/null
+++ b/fs/orangefs/file.c
@@ -0,0 +1,717 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13#include "orangefs-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
17/*
18 * Copy to client-core's address space from the buffers specified
19 * by the iovec up to total_size bytes.
20 * NOTE: the iovec can contain either addresses (which may in
21 * turn be kernel-space or user-space addresses) or pointers
22 * to struct page's.
23 */
24static int precopy_buffers(int buffer_index,
25 struct iov_iter *iter,
26 size_t total_size)
27{
28 int ret = 0;
29 /*
30 * copy data from application/kernel by pulling it out
31 * of the iovec.
32 */
33
35 if (total_size) {
36 ret = orangefs_bufmap_copy_from_iovec(iter,
37 buffer_index,
38 total_size);
39 if (ret < 0)
40 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
41 __func__,
42 (long)ret);
43 }
44
45 if (ret < 0)
46 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
47 __func__,
48 (long)ret);
49 return ret;
50}
51
52/*
53 * Copy from client-core's address space to the buffers specified
54 * by the iovec up to total_size bytes.
55 * NOTE: the iovec can contain either addresses (which may in
56 * turn be kernel-space or user-space addresses) or pointers
57 * to struct page's.
58 */
59static int postcopy_buffers(int buffer_index,
60 struct iov_iter *iter,
61 size_t total_size)
62{
63 int ret = 0;
64 /*
65 * copy data to application/kernel by pushing it out to
66 * the iovec. NOTE; target buffers can be addresses or
67 * struct page pointers.
68 */
69 if (total_size) {
70 ret = orangefs_bufmap_copy_to_iovec(iter,
71 buffer_index,
72 total_size);
73 if (ret < 0)
74 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
75 __func__,
76 (long)ret);
77 }
78 return ret;
79}
80
81/*
82 * Post and wait for the I/O upcall to finish
83 */
84static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
85 loff_t *offset, struct iov_iter *iter,
86 size_t total_size, loff_t readahead_size)
87{
88 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
89 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
90 struct orangefs_kernel_op_s *new_op = NULL;
91 struct iov_iter saved = *iter;
92 int buffer_index = -1;
93 ssize_t ret;
94
95 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
96 if (!new_op)
97 return -ENOMEM;
98
99 /* synchronous I/O */
100 new_op->upcall.req.io.readahead_size = readahead_size;
101 new_op->upcall.req.io.io_type = type;
102 new_op->upcall.req.io.refn = orangefs_inode->refn;
103
104populate_shared_memory:
105 /* get a shared buffer index */
106 buffer_index = orangefs_bufmap_get();
107 if (buffer_index < 0) {
108 ret = buffer_index;
109 gossip_debug(GOSSIP_FILE_DEBUG,
110 "%s: orangefs_bufmap_get failure (%zd)\n",
111 __func__, ret);
112 goto out;
113 }
114 gossip_debug(GOSSIP_FILE_DEBUG,
115 "%s(%pU): GET op %p -> buffer_index %d\n",
116 __func__,
117 handle,
118 new_op,
119 buffer_index);
120
121 new_op->uses_shared_memory = 1;
122 new_op->upcall.req.io.buf_index = buffer_index;
123 new_op->upcall.req.io.count = total_size;
124 new_op->upcall.req.io.offset = *offset;
125
126 gossip_debug(GOSSIP_FILE_DEBUG,
127 "%s(%pU): offset: %llu total_size: %zd\n",
128 __func__,
129 handle,
130 llu(*offset),
131 total_size);
132 /*
133 * Stage 1: copy the buffers into client-core's address space
134 * precopy_buffers only pertains to writes.
135 */
136 if (type == ORANGEFS_IO_WRITE) {
137 ret = precopy_buffers(buffer_index,
138 iter,
139 total_size);
140 if (ret < 0)
141 goto out;
142 }
143
144 gossip_debug(GOSSIP_FILE_DEBUG,
145 "%s(%pU): Calling post_io_request with tag (%llu)\n",
146 __func__,
147 handle,
148 llu(new_op->tag));
149
150 /* Stage 2: Service the I/O operation */
151 ret = service_operation(new_op,
152 type == ORANGEFS_IO_WRITE ?
153 "file_write" :
154 "file_read",
155 get_interruptible_flag(inode));
156
157 /*
158 * If service_operation() returns -EAGAIN #and# the operation was
159 * purged from orangefs_request_list or htable_ops_in_progress, then
160 * we know that the client was restarted, causing the shared memory
161 * area to be wiped clean. To restart a write operation in this
162 * case, we must re-copy the data from the user's iovec to a NEW
163 * shared memory location. To restart a read operation, we must get
164 * a new shared memory location.
165 */
166 if (ret == -EAGAIN && op_state_purged(new_op)) {
167 orangefs_bufmap_put(buffer_index);
168 buffer_index = -1;
169 if (type == ORANGEFS_IO_WRITE)
170 *iter = saved;
171 gossip_debug(GOSSIP_FILE_DEBUG,
172 "%s:going to repopulate_shared_memory.\n",
173 __func__);
174 goto populate_shared_memory;
175 }
176
177 if (ret < 0) {
178 if (ret == -EINTR) {
179 /*
180 * We can't return EINTR if any data was written,
181 * it's not POSIX. It is minimally acceptable
182 * to give a partial write, the way NFS does.
183 *
184 * It would be optimal to return all or nothing,
185 * but if a userspace write is bigger than
186 * an IO buffer, and the interrupt occurs
187 * between buffer writes, that would not be
188 * possible.
189 */
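			/*
			 * At this point the op is expected to have been
			 * given up, so op_state is the pre-interrupt
			 * state with OP_VFS_STATE_GIVEN_UP OR'd in;
			 * subtracting that bit back out recovers the
			 * state the op was in when the interrupt
			 * arrived.
			 */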
190 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
191 /*
192 * If the op was waiting when the interrupt
193 * occurred, then the client-core did not
194 * trigger the write.
195 */
196 case OP_VFS_STATE_WAITING:
197 if (*offset == 0)
198 ret = -EINTR;
199 else
200 ret = 0;
201 break;
202 /*
203 * If the op was in progress when the interrupt
204 * occurred, then the client-core was able to
205 * trigger the write.
206 */
207 case OP_VFS_STATE_INPROGR:
208 ret = total_size;
209 break;
210 default:
211 gossip_err("%s: unexpected op state :%d:.\n",
212 __func__,
213 new_op->op_state);
214 ret = 0;
215 break;
216 }
217 gossip_debug(GOSSIP_FILE_DEBUG,
218 "%s: got EINTR, state:%d: %p\n",
219 __func__,
220 new_op->op_state,
221 new_op);
222 } else {
223 gossip_err("%s: error in %s handle %pU, returning %zd\n",
224 __func__,
225 type == ORANGEFS_IO_READ ?
226 "read from" : "write to",
227 handle, ret);
228 }
229 if (orangefs_cancel_op_in_progress(new_op))
230 return ret;
231
232 goto out;
233 }
234
235 /*
236 * Stage 3: Post copy buffers from client-core's address space
237 * postcopy_buffers only pertains to reads.
238 */
239 if (type == ORANGEFS_IO_READ) {
240 ret = postcopy_buffers(buffer_index,
241 iter,
242 new_op->downcall.resp.io.amt_complete);
243 if (ret < 0)
244 goto out;
245 }
246 gossip_debug(GOSSIP_FILE_DEBUG,
247 "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
248 __func__,
249 handle,
250 type == ORANGEFS_IO_READ ? "read" : "written",
251 (int)new_op->downcall.resp.io.amt_complete);
252
253 ret = new_op->downcall.resp.io.amt_complete;
254
255out:
256 if (buffer_index >= 0) {
257 orangefs_bufmap_put(buffer_index);
258 gossip_debug(GOSSIP_FILE_DEBUG,
259 "%s(%pU): PUT buffer_index %d\n",
260 __func__, handle, buffer_index);
261 buffer_index = -1;
262 }
263 op_release(new_op);
264 return ret;
265}
266
267/*
268 * Common entry point for read/write/readv/writev
269 * This function will dispatch it to either the direct I/O
270 * or buffered I/O path depending on the mount options and/or
271 * augmented/extended metadata attached to the file.
272 * Note: File extended attributes override any mount options.
273 */
274static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
275 loff_t *offset, struct iov_iter *iter)
276{
277 struct inode *inode = file->f_mapping->host;
278 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
279 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
280 size_t count = iov_iter_count(iter);
281 ssize_t total_count = 0;
282 ssize_t ret = -EINVAL;
283
284 gossip_debug(GOSSIP_FILE_DEBUG,
285 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
286 __func__,
287 handle,
288 (int)count);
289
290 if (type == ORANGEFS_IO_WRITE) {
291 gossip_debug(GOSSIP_FILE_DEBUG,
292 "%s(%pU): proceeding with offset : %llu, "
293 "size %d\n",
294 __func__,
295 handle,
296 llu(*offset),
297 (int)count);
298 }
299
300 if (count == 0) {
301 ret = 0;
302 goto out;
303 }
304
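	/*
	 * Illustrative sketch (sizes are hypothetical): with a 4 MB
	 * shared buffer, a 10 MB request is serviced by the loop below
	 * as 4 MB + 4 MB + 2 MB transfers; a short transfer ends the
	 * loop early and the byte count accumulated so far is returned.
	 */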
305 while (iov_iter_count(iter)) {
306 size_t each_count = iov_iter_count(iter);
307 size_t amt_complete;
308
309 /* how much to transfer in this loop iteration */
310 if (each_count > orangefs_bufmap_size_query())
311 each_count = orangefs_bufmap_size_query();
312
313 gossip_debug(GOSSIP_FILE_DEBUG,
314 "%s(%pU): size of each_count(%d)\n",
315 __func__,
316 handle,
317 (int)each_count);
318 gossip_debug(GOSSIP_FILE_DEBUG,
319 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
320 __func__,
321 handle,
322 (int)*offset);
323
324 ret = wait_for_direct_io(type, inode, offset, iter,
325 each_count, 0);
326 gossip_debug(GOSSIP_FILE_DEBUG,
327 "%s(%pU): return from wait_for_io:%d\n",
328 __func__,
329 handle,
330 (int)ret);
331
332 if (ret < 0)
333 goto out;
334
335 *offset += ret;
336 total_count += ret;
337 amt_complete = ret;
338
339 gossip_debug(GOSSIP_FILE_DEBUG,
340 "%s(%pU): AFTER wait_for_io: offset is %d\n",
341 __func__,
342 handle,
343 (int)*offset);
344
345 /*
346 * if we got a short I/O operation,
347 * fall out and return what we got so far
348 */
349 if (amt_complete < each_count)
350 break;
351 } /*end while */
352
353out:
354 if (total_count > 0)
355 ret = total_count;
356 if (ret > 0) {
357 if (type == ORANGEFS_IO_READ) {
358 file_accessed(file);
359 } else {
360 SetMtimeFlag(orangefs_inode);
361 inode->i_mtime = CURRENT_TIME;
362 mark_inode_dirty_sync(inode);
363 }
364 }
365
366 gossip_debug(GOSSIP_FILE_DEBUG,
367 "%s(%pU): Value(%d) returned.\n",
368 __func__,
369 handle,
370 (int)ret);
371
372 return ret;
373}
374
375/*
376 * Read data from a specified offset in a file (referenced by inode).
377 * Data may be placed either in a user or kernel buffer.
378 */
379ssize_t orangefs_inode_read(struct inode *inode,
380 struct iov_iter *iter,
381 loff_t *offset,
382 loff_t readahead_size)
383{
384 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
385 size_t count = iov_iter_count(iter);
386 size_t bufmap_size;
387 ssize_t ret = -EINVAL;
388
389 g_orangefs_stats.reads++;
390
391 bufmap_size = orangefs_bufmap_size_query();
392 if (count > bufmap_size) {
393 gossip_debug(GOSSIP_FILE_DEBUG,
394 "%s: count is too large (%zd/%zd)!\n",
395 __func__, count, bufmap_size);
396 return -EINVAL;
397 }
398
399 gossip_debug(GOSSIP_FILE_DEBUG,
400 "%s(%pU) %zd@%llu\n",
401 __func__,
402 &orangefs_inode->refn.khandle,
403 count,
404 llu(*offset));
405
406 ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
407 count, readahead_size);
408 if (ret > 0)
409 *offset += ret;
410
411 gossip_debug(GOSSIP_FILE_DEBUG,
412 "%s(%pU): Value(%zd) returned.\n",
413 __func__,
414 &orangefs_inode->refn.khandle,
415 ret);
416
417 return ret;
418}
419
420static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
421{
422 struct file *file = iocb->ki_filp;
423 loff_t pos = *(&iocb->ki_pos);
424 ssize_t rc = 0;
425
426 BUG_ON(iocb->private);
427
428 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
429
430 g_orangefs_stats.reads++;
431
432 rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
433 iocb->ki_pos = pos;
434
435 return rc;
436}
437
438static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
439{
440 struct file *file = iocb->ki_filp;
441 loff_t pos;
442 ssize_t rc;
443
444 BUG_ON(iocb->private);
445
446 gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
447
448 mutex_lock(&file->f_mapping->host->i_mutex);
449
450 /* Make sure generic_write_checks sees an up to date inode size. */
451 if (file->f_flags & O_APPEND) {
452 rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
453 if (rc == -ESTALE)
454 rc = -EIO;
455 if (rc) {
456 gossip_err("%s: orangefs_inode_getattr failed, "
457 "rc:%zd:.\n", __func__, rc);
458 goto out;
459 }
460 }
461
462 if (file->f_pos > i_size_read(file->f_mapping->host))
463 orangefs_i_size_write(file->f_mapping->host, file->f_pos);
464
465 rc = generic_write_checks(iocb, iter);
466
467 if (rc <= 0) {
468 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
469 __func__, rc);
470 goto out;
471 }
472
473 /*
474 * if we are appending, generic_write_checks would have updated
475 * pos to the end of the file, so we will wait till now to set
476 * pos...
477 */
478 pos = *(&iocb->ki_pos);
479
480 rc = do_readv_writev(ORANGEFS_IO_WRITE,
481 file,
482 &pos,
483 iter);
484 if (rc < 0) {
485 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
486 __func__, rc);
487 goto out;
488 }
489
490 iocb->ki_pos = pos;
491 g_orangefs_stats.writes++;
492
493out:
494
495 mutex_unlock(&file->f_mapping->host->i_mutex);
496 return rc;
497}
498
499/*
500 * Perform a miscellaneous operation on a file.
501 */
502static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
503{
504 int ret = -ENOTTY;
505 __u64 val = 0;
506 unsigned long uval;
507
508 gossip_debug(GOSSIP_FILE_DEBUG,
509 "orangefs_ioctl: called with cmd %d\n",
510 cmd);
511
512 /*
513 * we understand some general ioctls on files, such as the immutable
514 * and append flags
515 */
516 if (cmd == FS_IOC_GETFLAGS) {
517 val = 0;
518 ret = orangefs_inode_getxattr(file_inode(file),
519 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
520 "user.pvfs2.meta_hint",
521 &val, sizeof(val));
522 if (ret < 0 && ret != -ENODATA)
523 return ret;
524 else if (ret == -ENODATA)
525 val = 0;
526 uval = val;
527 gossip_debug(GOSSIP_FILE_DEBUG,
528 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
529 (unsigned long long)uval);
530 return put_user(uval, (int __user *)arg);
531 } else if (cmd == FS_IOC_SETFLAGS) {
532 ret = 0;
533 if (get_user(uval, (int __user *)arg))
534 return -EFAULT;
535 /*
536 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
537 * is turned on for a file. The user is not allowed to turn
538 * on this bit, but the bit is present if the user first gets
539 * the flags and then updates the flags with some new
540 * settings. So, we ignore it in the following edit. bligon.
541 */
542 if ((uval & ~ORANGEFS_MIRROR_FL) &
543 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
544 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
545 return -EINVAL;
546 }
547 val = uval;
548 gossip_debug(GOSSIP_FILE_DEBUG,
549 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
550 (unsigned long long)val);
551 ret = orangefs_inode_setxattr(file_inode(file),
552 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
553 "user.pvfs2.meta_hint",
554 &val, sizeof(val), 0);
555 }
556
557 return ret;
558}
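
/*
 * Userspace sketch (hypothetical, assumes <sys/ioctl.h> and
 * <linux/fs.h>): how the flags handled above are typically read and
 * updated:
 *
 *	int flags;
 *
 *	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
 *		flags |= FS_NOATIME_FL;
 *		ioctl(fd, FS_IOC_SETFLAGS, &flags);
 *	}
 *
 * On orangefs these bits are backed by the "user.pvfs2.meta_hint"
 * extended attribute rather than by on-disk inode flags.
 */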
559
560/*
561 * Memory map a region of a file.
562 */
563static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
564{
565 gossip_debug(GOSSIP_FILE_DEBUG,
566 "orangefs_file_mmap: called on %s\n",
567 (file ?
568 (char *)file->f_path.dentry->d_name.name :
569 (char *)"Unknown"));
570
571 /* set the sequential readahead hint */
572 vma->vm_flags |= VM_SEQ_READ;
573 vma->vm_flags &= ~VM_RAND_READ;
574
575 /* Use readonly mmap since we cannot support writable maps. */
576 return generic_file_readonly_mmap(file, vma);
577}
578
579#define mapping_nrpages(idata) ((idata)->nrpages)
580
581/*
582 * Called to notify the module that there are no more references to
583 * this file (i.e. no processes have it open).
584 *
585 * \note Not called when each file is closed.
586 */
587static int orangefs_file_release(struct inode *inode, struct file *file)
588{
589 gossip_debug(GOSSIP_FILE_DEBUG,
590 "orangefs_file_release: called on %s\n",
591 file->f_path.dentry->d_name.name);
592
593 orangefs_flush_inode(inode);
594
595 /*
596 * remove all associated inode pages from the page cache and mmap
597 * readahead cache (if any); this forces an expensive refresh of
598 * data for the next caller of mmap (or 'get_block' accesses)
599 */
600 if (file->f_path.dentry->d_inode &&
601 file->f_path.dentry->d_inode->i_mapping &&
602 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
603 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
604 0);
605 return 0;
606}
607
608/*
609 * Push all data for a specific file onto permanent storage.
610 */
611static int orangefs_fsync(struct file *file,
612 loff_t start,
613 loff_t end,
614 int datasync)
615{
616 int ret = -EINVAL;
617 struct orangefs_inode_s *orangefs_inode =
618 ORANGEFS_I(file->f_path.dentry->d_inode);
619 struct orangefs_kernel_op_s *new_op = NULL;
620
621 /* required call */
622 filemap_write_and_wait_range(file->f_mapping, start, end);
623
624 new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
625 if (!new_op)
626 return -ENOMEM;
627 new_op->upcall.req.fsync.refn = orangefs_inode->refn;
628
629 ret = service_operation(new_op,
630 "orangefs_fsync",
631 get_interruptible_flag(file->f_path.dentry->d_inode));
632
633 gossip_debug(GOSSIP_FILE_DEBUG,
634 "orangefs_fsync got return value of %d\n",
635 ret);
636
637 op_release(new_op);
638
639 orangefs_flush_inode(file->f_path.dentry->d_inode);
640 return ret;
641}
642
643/*
644 * Change the file pointer position for an instance of an open file.
645 *
646 * \note If .llseek is overridden, we must acquire the lock as
647 * described in Documentation/filesystems/Locking.
648 *
649 * A future upgrade could support SEEK_DATA and SEEK_HOLE but would
650 * require substantial changes to the FS.
651 */
652static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
653{
654 int ret = -EINVAL;
655 struct inode *inode = file_inode(file);
656
657 if (origin == SEEK_END) {
658 /*
659 * revalidate the inode's file size.
660 * NOTE: We are only interested in file size here,
661 * so we set mask accordingly.
662 */
663 ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
664 if (ret == -ESTALE)
665 ret = -EIO;
666 if (ret) {
667 gossip_debug(GOSSIP_FILE_DEBUG,
668 "%s:%s:%d calling make bad inode\n",
669 __FILE__,
670 __func__,
671 __LINE__);
672 return ret;
673 }
674 }
675
676 gossip_debug(GOSSIP_FILE_DEBUG,
677 "orangefs_file_llseek: offset is %ld | origin is %d"
678 " | inode size is %lu\n",
679 (long)offset,
680 origin,
681 (unsigned long)i_size_read(inode));
682
683 return generic_file_llseek(file, offset, origin);
684}
685
686/*
687 * Support local locks (locks that only this kernel knows about)
688 * if Orangefs was mounted -o local_lock.
689 */
690static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
691{
692 int rc = -EINVAL;
693
694 if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
695 if (cmd == F_GETLK) {
696 rc = 0;
697 posix_test_lock(filp, fl);
698 } else {
699 rc = posix_lock_file(filp, fl, NULL);
700 }
701 }
702
703 return rc;
704}
705
706/** ORANGEFS implementation of VFS file operations */
707const struct file_operations orangefs_file_operations = {
708 .llseek = orangefs_file_llseek,
709 .read_iter = orangefs_file_read_iter,
710 .write_iter = orangefs_file_write_iter,
711 .lock = orangefs_lock,
712 .unlocked_ioctl = orangefs_ioctl,
713 .mmap = orangefs_file_mmap,
714 .open = generic_file_open,
715 .release = orangefs_file_release,
716 .fsync = orangefs_fsync,
717};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
new file mode 100644
index 000000000000..2382e267b49e
--- /dev/null
+++ b/fs/orangefs/inode.c
@@ -0,0 +1,475 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS inode operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13#include "orangefs-bufmap.h"
14
15static int read_one_page(struct page *page)
16{
17 int ret;
18 int max_block;
19 ssize_t bytes_read = 0;
20 struct inode *inode = page->mapping->host;
21 const __u32 blocksize = PAGE_CACHE_SIZE; /* inode->i_blksize */
22 const __u32 blockbits = PAGE_CACHE_SHIFT; /* inode->i_blkbits */
23 struct iov_iter to;
24 struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
25
26 iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
27
28 gossip_debug(GOSSIP_INODE_DEBUG,
29 "orangefs_readpage called with page %p\n",
30 page);
31
32 max_block = ((inode->i_size / blocksize) + 1);
33
34 if (page->index < max_block) {
35 loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
36
37 bytes_read = orangefs_inode_read(inode,
38 &to,
39 &blockptr_offset,
40 inode->i_size);
41 }
42 /* this will only zero remaining unread portions of the page data */
43 iov_iter_zero(~0U, &to);
44 /* takes care of potential aliasing */
45 flush_dcache_page(page);
46 if (bytes_read < 0) {
47 ret = bytes_read;
48 SetPageError(page);
49 } else {
50 SetPageUptodate(page);
51 if (PageError(page))
52 ClearPageError(page);
53 ret = 0;
54 }
55 /* unlock the page after the ->readpage() routine completes */
56 unlock_page(page);
57 return ret;
58}
59
60static int orangefs_readpage(struct file *file, struct page *page)
61{
62 return read_one_page(page);
63}
64
65static int orangefs_readpages(struct file *file,
66 struct address_space *mapping,
67 struct list_head *pages,
68 unsigned nr_pages)
69{
70 int page_idx;
71 int ret;
72
73 gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
74
75 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
76 struct page *page;
77
78 page = list_entry(pages->prev, struct page, lru);
79 list_del(&page->lru);
80 if (!add_to_page_cache(page,
81 mapping,
82 page->index,
83 GFP_KERNEL)) {
84 ret = read_one_page(page);
85 gossip_debug(GOSSIP_INODE_DEBUG,
86 "failure adding page to cache, read_one_page returned: %d\n",
87 ret);
88 } else {
89 page_cache_release(page);
90 }
91 }
92 BUG_ON(!list_empty(pages));
93 return 0;
94}
95
96static void orangefs_invalidatepage(struct page *page,
97 unsigned int offset,
98 unsigned int length)
99{
100 gossip_debug(GOSSIP_INODE_DEBUG,
101 "orangefs_invalidatepage called on page %p "
102 "(offset is %u)\n",
103 page,
104 offset);
105
106 ClearPageUptodate(page);
107 ClearPageMappedToDisk(page);
110}
111
112static int orangefs_releasepage(struct page *page, gfp_t foo)
113{
114 gossip_debug(GOSSIP_INODE_DEBUG,
115 "orangefs_releasepage called on page %p\n",
116 page);
117 return 0;
118}
119
120/*
121 * Having a direct_IO entry point in the address_space_operations
122 * struct causes the kernel to allow us to use O_DIRECT on
123 * open. Nothing will ever call this thing, but in the future we
124 * will need to be able to use O_DIRECT on open in order to support
125 * AIO. Modeled after NFS, they do this too.
126 */
127/*
128 * static ssize_t orangefs_direct_IO(int rw,
129 * struct kiocb *iocb,
130 * struct iov_iter *iter,
131 * loff_t offset)
132 *{
133 * gossip_debug(GOSSIP_INODE_DEBUG,
134 * "orangefs_direct_IO: %s\n",
135 * iocb->ki_filp->f_path.dentry->d_name.name);
136 *
137 * return -EINVAL;
138 *}
139 */
140
141struct backing_dev_info orangefs_backing_dev_info = {
142 .name = "orangefs",
143 .ra_pages = 0,
144 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
145};
146
147/** ORANGEFS2 implementation of address space operations */
148const struct address_space_operations orangefs_address_operations = {
149 .readpage = orangefs_readpage,
150 .readpages = orangefs_readpages,
151 .invalidatepage = orangefs_invalidatepage,
152 .releasepage = orangefs_releasepage,
153/* .direct_IO = orangefs_direct_IO */
154};
155
156static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
157{
158 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
159 struct orangefs_kernel_op_s *new_op;
160 loff_t orig_size;
161 int ret = -EINVAL;
162
163 gossip_debug(GOSSIP_INODE_DEBUG,
164 "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
165 __func__,
166 get_khandle_from_ino(inode),
167 &orangefs_inode->refn.khandle,
168 orangefs_inode->refn.fs_id,
169 iattr->ia_size);
170
171 /* Ensure that we have an up-to-date size, so we know if it changed. */
172 ret = orangefs_inode_getattr(inode, 0, 1);
173 if (ret == -ESTALE)
174 ret = -EIO;
175 if (ret) {
176 gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
177 __func__, ret);
178 return ret;
179 }
180 orig_size = i_size_read(inode);
181
182 truncate_setsize(inode, iattr->ia_size);
183
184 new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
185 if (!new_op)
186 return -ENOMEM;
187
188 new_op->upcall.req.truncate.refn = orangefs_inode->refn;
189 new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
190
191 ret = service_operation(new_op, __func__,
192 get_interruptible_flag(inode));
193
194 /*
195 * the truncate has no downcall members to retrieve, but
196 * the status value tells us if it went through ok or not
197 */
198 gossip_debug(GOSSIP_INODE_DEBUG,
199 "orangefs: orangefs_truncate got return value of %d\n",
200 ret);
201
202 op_release(new_op);
203
204 if (ret != 0)
205 return ret;
206
207 /*
208 * Only change the c/mtime if we are changing the size or we are
209 * explicitly asked to change it. This handles the semantic difference
210 * between truncate() and ftruncate() as implemented in the VFS.
211 *
212 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
213 * special case where we need to update the times despite not having
214 * these flags set. For all other operations the VFS set these flags
215 * explicitly if it wants a timestamp update.
216 */
217 if (orig_size != i_size_read(inode) &&
218 !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
219 iattr->ia_ctime = iattr->ia_mtime =
220 current_fs_time(inode->i_sb);
221 iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
222 }
223
224 return ret;
225}
226
227/*
228 * Change attributes of an object referenced by dentry.
229 */
230int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
231{
232 int ret = -EINVAL;
233 struct inode *inode = dentry->d_inode;
234
235 gossip_debug(GOSSIP_INODE_DEBUG,
236 "orangefs_setattr: called on %s\n",
237 dentry->d_name.name);
238
239 ret = inode_change_ok(inode, iattr);
240 if (ret)
241 goto out;
242
243 if ((iattr->ia_valid & ATTR_SIZE) &&
244 iattr->ia_size != i_size_read(inode)) {
245 ret = orangefs_setattr_size(inode, iattr);
246 if (ret)
247 goto out;
248 }
249
250 setattr_copy(inode, iattr);
251 mark_inode_dirty(inode);
252
253 ret = orangefs_inode_setattr(inode, iattr);
254 gossip_debug(GOSSIP_INODE_DEBUG,
255 "orangefs_setattr: inode_setattr returned %d\n",
256 ret);
257
258 if (!ret && (iattr->ia_valid & ATTR_MODE))
259 /* change mode on a file that has ACLs */
260 ret = posix_acl_chmod(inode, inode->i_mode);
261
262out:
263 gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret);
264 return ret;
265}
266
267/*
268 * Obtain attributes of an object given a dentry
269 */
270int orangefs_getattr(struct vfsmount *mnt,
271 struct dentry *dentry,
272 struct kstat *kstat)
273{
274 int ret = -ENOENT;
275 struct inode *inode = dentry->d_inode;
276 struct orangefs_inode_s *orangefs_inode = NULL;
277
278 gossip_debug(GOSSIP_INODE_DEBUG,
279 "orangefs_getattr: called on %s\n",
280 dentry->d_name.name);
281
282 ret = orangefs_inode_getattr(inode, 0, 1);
283 if (ret == 0) {
284 generic_fillattr(inode, kstat);
285
286 /* override block size reported to stat */
287 orangefs_inode = ORANGEFS_I(inode);
288 kstat->blksize = orangefs_inode->blksize;
289 }
290 return ret;
291}
292
293int orangefs_permission(struct inode *inode, int mask)
294{
295 int ret;
296
297 if (mask & MAY_NOT_BLOCK)
298 return -ECHILD;
299
300 gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
301
302 /* Make sure the permission (and other common attrs) are up to date. */
303 ret = orangefs_inode_getattr(inode, 0, 0);
304 if (ret < 0)
305 return ret;
306
307 return generic_permission(inode, mask);
308}
309
310/* ORANGEFS2 implementation of VFS inode operations for files */
311struct inode_operations orangefs_file_inode_operations = {
312 .get_acl = orangefs_get_acl,
313 .set_acl = orangefs_set_acl,
314 .setattr = orangefs_setattr,
315 .getattr = orangefs_getattr,
316 .setxattr = generic_setxattr,
317 .getxattr = generic_getxattr,
318 .listxattr = orangefs_listxattr,
319 .removexattr = generic_removexattr,
320 .permission = orangefs_permission,
321};
322
323static int orangefs_init_iops(struct inode *inode)
324{
325 inode->i_mapping->a_ops = &orangefs_address_operations;
326
327 switch (inode->i_mode & S_IFMT) {
328 case S_IFREG:
329 inode->i_op = &orangefs_file_inode_operations;
330 inode->i_fop = &orangefs_file_operations;
331 inode->i_blkbits = PAGE_CACHE_SHIFT;
332 break;
333 case S_IFLNK:
334 inode->i_op = &orangefs_symlink_inode_operations;
335 break;
336 case S_IFDIR:
337 inode->i_op = &orangefs_dir_inode_operations;
338 inode->i_fop = &orangefs_dir_operations;
339 break;
340 default:
341 gossip_debug(GOSSIP_INODE_DEBUG,
342 "%s: unsupported mode\n",
343 __func__);
344 return -EINVAL;
345 }
346
347 return 0;
348}
349
350/*
351 * Given an ORANGEFS object identifier (fsid, handle), convert it into an ino_t type
352 * that will be used as a hash-index from where the handle will
353 * be searched for in the VFS hash table of inodes.
354 */
355static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
356{
357 if (!ref)
358 return 0;
359 return orangefs_khandle_to_ino(&(ref->khandle));
360}
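
/*
 * Since ino_t is narrower than a 16-byte khandle, distinct handles can
 * hash to the same ino_t; iget5_locked() resolves such collisions by
 * calling orangefs_test_inode() below, which compares the full khandle
 * and fs_id.
 */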
361
362/*
363 * Called to set up an inode from iget5_locked.
364 */
365static int orangefs_set_inode(struct inode *inode, void *data)
366{
367 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
368 ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
369 ORANGEFS_I(inode)->refn.khandle = ref->khandle;
370 return 0;
371}
372
373/*
374 * Called to determine if handles match.
375 */
376static int orangefs_test_inode(struct inode *inode, void *data)
377{
378 struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
379 struct orangefs_inode_s *orangefs_inode = NULL;
380
381 orangefs_inode = ORANGEFS_I(inode);
382 return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle))
383 && orangefs_inode->refn.fs_id == ref->fs_id);
384}
385
386/*
387 * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
388 * file handle.
389 *
390 * @sb: the file system super block instance.
391 * @ref: The ORANGEFS object for which we are trying to locate an inode structure.
392 */
393struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref)
394{
395 struct inode *inode = NULL;
396 unsigned long hash;
397 int error;
398
399 hash = orangefs_handle_hash(ref);
400 inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref);
401 if (!inode || !(inode->i_state & I_NEW))
402 return inode;
403
404 error = orangefs_inode_getattr(inode, 1, 0);
405 if (error) {
406 iget_failed(inode);
407 return ERR_PTR(error);
408 }
409
410 inode->i_ino = hash; /* needed for stat etc */
411 orangefs_init_iops(inode);
412 unlock_new_inode(inode);
413
414 gossip_debug(GOSSIP_INODE_DEBUG,
415 "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
416 &ref->khandle,
417 ref->fs_id,
418 hash,
419 inode->i_ino);
420
421 return inode;
422}
423
424/*
425 * Allocate an inode for a newly created file and insert it into the inode hash.
426 */
427struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
428 int mode, dev_t dev, struct orangefs_object_kref *ref)
429{
430 unsigned long hash = orangefs_handle_hash(ref);
431 struct inode *inode;
432 int error;
433
434 gossip_debug(GOSSIP_INODE_DEBUG,
435 "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
436 __func__,
437 sb,
438 MAJOR(dev),
439 MINOR(dev),
440 mode);
441
442 inode = new_inode(sb);
443 if (!inode)
444 return NULL;
445
446 orangefs_set_inode(inode, ref);
447 inode->i_ino = hash; /* needed for stat etc */
448
449 error = orangefs_inode_getattr(inode, 1, 0);
450 if (error)
451 goto out_iput;
452
453 orangefs_init_iops(inode);
454
455 inode->i_mode = mode;
456 inode->i_uid = current_fsuid();
457 inode->i_gid = current_fsgid();
458 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
459 inode->i_size = PAGE_CACHE_SIZE;
460 inode->i_rdev = dev;
461
462 error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
463 if (error < 0)
464 goto out_iput;
465
466 gossip_debug(GOSSIP_INODE_DEBUG,
467 "Initializing ACL's for inode %pU\n",
468 get_khandle_from_ino(inode));
469 orangefs_init_acl(inode, dir);
470 return inode;
471
472out_iput:
473 iput(inode);
474 return ERR_PTR(error);
475}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
new file mode 100644
index 000000000000..5a60c508af4e
--- /dev/null
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,462 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS namei operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13
14/*
15 * Get a newly allocated inode to go with a negative dentry.
16 */
17static int orangefs_create(struct inode *dir,
18 struct dentry *dentry,
19 umode_t mode,
20 bool exclusive)
21{
22 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
23 struct orangefs_kernel_op_s *new_op;
24 struct inode *inode;
25 int ret;
26
27 gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n",
28 __func__,
29 dentry->d_name.name);
30
31 new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
32 if (!new_op)
33 return -ENOMEM;
34
35 new_op->upcall.req.create.parent_refn = parent->refn;
36
37 fill_default_sys_attrs(new_op->upcall.req.create.attributes,
38 ORANGEFS_TYPE_METAFILE, mode);
39
40 strncpy(new_op->upcall.req.create.d_name,
41 dentry->d_name.name, ORANGEFS_NAME_MAX);
42
43 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
44
45 gossip_debug(GOSSIP_NAME_DEBUG,
46 "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
47 __func__,
48 dentry->d_name.name,
49 &new_op->downcall.resp.create.refn.khandle,
50 new_op->downcall.resp.create.refn.fs_id,
51 new_op,
52 ret);
53
54 if (ret < 0)
55 goto out;
56
57 inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
58 &new_op->downcall.resp.create.refn);
59 if (IS_ERR(inode)) {
60 gossip_err("%s: Failed to allocate inode for file :%s:\n",
61 __func__,
62 dentry->d_name.name);
63 ret = PTR_ERR(inode);
64 goto out;
65 }
66
67 gossip_debug(GOSSIP_NAME_DEBUG,
68 "%s: Assigned inode :%pU: for file :%s:\n",
69 __func__,
70 get_khandle_from_ino(inode),
71 dentry->d_name.name);
72
73 d_instantiate(dentry, inode);
74 unlock_new_inode(inode);
75
76 gossip_debug(GOSSIP_NAME_DEBUG,
77 "%s: dentry instantiated for %s\n",
78 __func__,
79 dentry->d_name.name);
80
81 SetMtimeFlag(parent);
82 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
83 mark_inode_dirty_sync(dir);
84 ret = 0;
85out:
86 op_release(new_op);
87 gossip_debug(GOSSIP_NAME_DEBUG,
88 "%s: %s: returning %d\n",
89 __func__,
90 dentry->d_name.name,
91 ret);
92 return ret;
93}
94
95/*
96 * Attempt to resolve an object name (dentry->d_name), parent handle, and
97 * fsid into a handle for the object.
98 */
99static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
100 unsigned int flags)
101{
102 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
103 struct orangefs_kernel_op_s *new_op;
104 struct inode *inode;
105 struct dentry *res;
106 int ret = -EINVAL;
107
108 /*
109 * in theory we could skip a lookup here (if the intent is to
110 * create) in order to avoid a potentially failed lookup, but
111 * leaving it in can skip a valid lookup and try to create a file
112 * that already exists (e.g. the vfs already handles checking for
113 * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
114 * in the create path)
115 */
116 gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
117 __func__, dentry->d_name.name);
118
119 if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1))
120 return ERR_PTR(-ENAMETOOLONG);
121
122 new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
123 if (!new_op)
124 return ERR_PTR(-ENOMEM);
125
126 new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
127
128 gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
129 __FILE__,
130 __func__,
131 __LINE__,
132 &parent->refn.khandle);
133 new_op->upcall.req.lookup.parent_refn = parent->refn;
134
135 strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
136 ORANGEFS_NAME_MAX);
137
138 gossip_debug(GOSSIP_NAME_DEBUG,
139 "%s: doing lookup on %s under %pU,%d\n",
140 __func__,
141 new_op->upcall.req.lookup.d_name,
142 &new_op->upcall.req.lookup.parent_refn.khandle,
143 new_op->upcall.req.lookup.parent_refn.fs_id);
144
145 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
146
147 gossip_debug(GOSSIP_NAME_DEBUG,
148 "Lookup Got %pU, fsid %d (ret=%d)\n",
149 &new_op->downcall.resp.lookup.refn.khandle,
150 new_op->downcall.resp.lookup.refn.fs_id,
151 ret);
152
153 if (ret < 0) {
154 if (ret == -ENOENT) {
155 /*
156 * if no inode was found, add a negative dentry to
157 * dcache anyway; if we don't, we don't hold expected
158 * lookup semantics and we most noticeably break
159 * during directory renames.
160 *
161 * however, if the operation failed or exited, do not
162 * add the dentry (e.g. in the case that a touch is
163 * issued on a file that already exists that was
164 * interrupted during this lookup -- no need to add
165 * another negative dentry for an existing file)
166 */
167
168 gossip_debug(GOSSIP_NAME_DEBUG,
169 "orangefs_lookup: Adding *negative* dentry "
170 "%p for %s\n",
171 dentry,
172 dentry->d_name.name);
173
174 d_add(dentry, NULL);
175 res = NULL;
176 goto out;
177 }
178
179 /* must be a non-recoverable error */
180 res = ERR_PTR(ret);
181 goto out;
182 }
183
184 inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
185 if (IS_ERR(inode)) {
186 gossip_debug(GOSSIP_NAME_DEBUG,
187 "error %ld from iget\n", PTR_ERR(inode));
188 res = ERR_CAST(inode);
189 goto out;
190 }
191
192 gossip_debug(GOSSIP_NAME_DEBUG,
193 "%s:%s:%d "
194 "Found good inode [%lu] with count [%d]\n",
195 __FILE__,
196 __func__,
197 __LINE__,
198 inode->i_ino,
199 (int)atomic_read(&inode->i_count));
200
201 /* update dentry/inode pair into dcache */
202 res = d_splice_alias(inode, dentry);
203
204 gossip_debug(GOSSIP_NAME_DEBUG,
205 "Lookup success (inode ct = %d)\n",
206 (int)atomic_read(&inode->i_count));
207out:
208 op_release(new_op);
209 return res;
210}
211
212/* return 0 on success; non-zero otherwise */
213static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct inode *inode = dentry->d_inode;
216 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
217 struct orangefs_kernel_op_s *new_op;
218 int ret;
219
220 gossip_debug(GOSSIP_NAME_DEBUG,
221 "%s: called on %s\n"
222 " (inode %pU): Parent is %pU | fs_id %d\n",
223 __func__,
224 dentry->d_name.name,
225 get_khandle_from_ino(inode),
226 &parent->refn.khandle,
227 parent->refn.fs_id);
228
229 new_op = op_alloc(ORANGEFS_VFS_OP_REMOVE);
230 if (!new_op)
231 return -ENOMEM;
232
233 new_op->upcall.req.remove.parent_refn = parent->refn;
234 strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
235 ORANGEFS_NAME_MAX);
236
237 ret = service_operation(new_op, "orangefs_unlink",
238 get_interruptible_flag(inode));
239
240 gossip_debug(GOSSIP_NAME_DEBUG,
241 "%s: service_operation returned:%d:\n",
242 __func__,
243 ret);
244
245 op_release(new_op);
246
247 if (!ret) {
248 drop_nlink(inode);
249
250 SetMtimeFlag(parent);
251 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
252 mark_inode_dirty_sync(dir);
253 }
254 return ret;
255}
256
257static int orangefs_symlink(struct inode *dir,
258 struct dentry *dentry,
259 const char *symname)
260{
261 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
262 struct orangefs_kernel_op_s *new_op;
263 struct inode *inode;
264 int mode = 0755;
265 int ret;
266
267 gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
268
269 if (!symname)
270 return -EINVAL;
271
272 if (strlen(symname)+1 > ORANGEFS_NAME_MAX)
273 return -ENAMETOOLONG;
274
275 new_op = op_alloc(ORANGEFS_VFS_OP_SYMLINK);
276 if (!new_op)
277 return -ENOMEM;
278
279 new_op->upcall.req.sym.parent_refn = parent->refn;
280
281 fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
282 ORANGEFS_TYPE_SYMLINK,
283 mode);
284
285 strncpy(new_op->upcall.req.sym.entry_name,
286 dentry->d_name.name,
287 ORANGEFS_NAME_MAX);
288 strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX);
289
290 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
291
292 gossip_debug(GOSSIP_NAME_DEBUG,
293 "Symlink Got ORANGEFS handle %pU on fsid %d (ret=%d)\n",
294 &new_op->downcall.resp.sym.refn.khandle,
295 new_op->downcall.resp.sym.refn.fs_id, ret);
296
297 if (ret < 0) {
298 gossip_debug(GOSSIP_NAME_DEBUG,
299 "%s: failed with error code %d\n",
300 __func__, ret);
301 goto out;
302 }
303
304 inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
305 &new_op->downcall.resp.sym.refn);
306 if (IS_ERR(inode)) {
307 gossip_err("*** Failed to allocate orangefs symlink inode\n");
309 ret = PTR_ERR(inode);
310 goto out;
311 }
312
313 gossip_debug(GOSSIP_NAME_DEBUG,
314 "Assigned symlink inode new number of %pU\n",
315 get_khandle_from_ino(inode));
316
317 d_instantiate(dentry, inode);
318 unlock_new_inode(inode);
319
320 gossip_debug(GOSSIP_NAME_DEBUG,
321 "Inode (Symlink) %pU -> %s\n",
322 get_khandle_from_ino(inode),
323 dentry->d_name.name);
324
325 SetMtimeFlag(parent);
326 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
327 mark_inode_dirty_sync(dir);
328 ret = 0;
329out:
330 op_release(new_op);
331 return ret;
332}
333
334static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
335{
336 struct orangefs_inode_s *parent = ORANGEFS_I(dir);
337 struct orangefs_kernel_op_s *new_op;
338 struct inode *inode;
339 int ret;
340
341 new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
342 if (!new_op)
343 return -ENOMEM;
344
345 new_op->upcall.req.mkdir.parent_refn = parent->refn;
346
347 fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
348 ORANGEFS_TYPE_DIRECTORY, mode);
349
350 strncpy(new_op->upcall.req.mkdir.d_name,
351 dentry->d_name.name, ORANGEFS_NAME_MAX);
352
353 ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
354
355 gossip_debug(GOSSIP_NAME_DEBUG,
356 "Mkdir Got ORANGEFS handle %pU on fsid %d\n",
357 &new_op->downcall.resp.mkdir.refn.khandle,
358 new_op->downcall.resp.mkdir.refn.fs_id);
359
360 if (ret < 0) {
361 gossip_debug(GOSSIP_NAME_DEBUG,
362 "%s: failed with error code %d\n",
363 __func__, ret);
364 goto out;
365 }
366
367 inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
368 &new_op->downcall.resp.mkdir.refn);
369 if (IS_ERR(inode)) {
370 gossip_err("*** Failed to allocate orangefs dir inode\n");
371 ret = PTR_ERR(inode);
372 goto out;
373 }
374
375 gossip_debug(GOSSIP_NAME_DEBUG,
376 "Assigned dir inode new number of %pU\n",
377 get_khandle_from_ino(inode));
378
379 d_instantiate(dentry, inode);
380 unlock_new_inode(inode);
381
382 gossip_debug(GOSSIP_NAME_DEBUG,
383 "Inode (Directory) %pU -> %s\n",
384 get_khandle_from_ino(inode),
385 dentry->d_name.name);
386
387 /*
388 * NOTE: we have no good way to keep nlink consistent for directories
389 * across clients; keep constant at 1.
390 */
391 SetMtimeFlag(parent);
392 dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
393 mark_inode_dirty_sync(dir);
394out:
395 op_release(new_op);
396 return ret;
397}
398
399static int orangefs_rename(struct inode *old_dir,
400 struct dentry *old_dentry,
401 struct inode *new_dir,
402 struct dentry *new_dentry)
403{
404 struct orangefs_kernel_op_s *new_op;
405 int ret;
406
407 gossip_debug(GOSSIP_NAME_DEBUG,
408 "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
409 old_dentry->d_parent->d_name.name,
410 old_dentry->d_name.name,
411 new_dentry->d_parent->d_name.name,
412 new_dentry->d_name.name,
413 d_count(new_dentry));
414
415 new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
416 if (!new_op)
417 return -ENOMEM;
418
419 new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn;
420 new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn;
421
422 strncpy(new_op->upcall.req.rename.d_old_name,
423 old_dentry->d_name.name,
424 ORANGEFS_NAME_MAX);
425 strncpy(new_op->upcall.req.rename.d_new_name,
426 new_dentry->d_name.name,
427 ORANGEFS_NAME_MAX);
428
429 ret = service_operation(new_op,
430 "orangefs_rename",
431 get_interruptible_flag(old_dentry->d_inode));
432
433 gossip_debug(GOSSIP_NAME_DEBUG,
434 "orangefs_rename: got downcall status %d\n",
435 ret);
436
437 if (new_dentry->d_inode)
438 new_dentry->d_inode->i_ctime = CURRENT_TIME;
439
440 op_release(new_op);
441 return ret;
442}
443
444/* ORANGEFS implementation of VFS inode operations for directories */
445struct inode_operations orangefs_dir_inode_operations = {
446 .lookup = orangefs_lookup,
447 .get_acl = orangefs_get_acl,
448 .set_acl = orangefs_set_acl,
449 .create = orangefs_create,
450 .unlink = orangefs_unlink,
451 .symlink = orangefs_symlink,
452 .mkdir = orangefs_mkdir,
453 .rmdir = orangefs_unlink,
454 .rename = orangefs_rename,
455 .setattr = orangefs_setattr,
456 .getattr = orangefs_getattr,
457 .setxattr = generic_setxattr,
458 .getxattr = generic_getxattr,
459 .removexattr = generic_removexattr,
460 .listxattr = orangefs_listxattr,
461 .permission = orangefs_permission,
462};
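/*
 * Note: .rmdir deliberately reuses orangefs_unlink; presumably the
 * OP_REMOVE upcall covers both files and directories on the server
 * side, so no separate rmdir handler is needed.
 */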
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
new file mode 100644
index 000000000000..1f8acc9f9a88
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -0,0 +1,556 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6#include "protocol.h"
7#include "orangefs-kernel.h"
8#include "orangefs-bufmap.h"
9
10struct slot_map {
11 int c;
12 wait_queue_head_t q;
13 int count;
14 unsigned long *map;
15};
16
17static struct slot_map rw_map = {
18 .c = -1,
19 .q = __WAIT_QUEUE_HEAD_INITIALIZER(rw_map.q)
20};
21static struct slot_map readdir_map = {
22 .c = -1,
23 .q = __WAIT_QUEUE_HEAD_INITIALIZER(readdir_map.q)
24};
25
26
27static void install(struct slot_map *m, int count, unsigned long *map)
28{
29 spin_lock(&m->q.lock);
30 m->c = m->count = count;
31 m->map = map;
32 wake_up_all_locked(&m->q);
33 spin_unlock(&m->q.lock);
34}
35
36static void mark_killed(struct slot_map *m)
37{
38 spin_lock(&m->q.lock);
39 m->c -= m->count + 1;
40 spin_unlock(&m->q.lock);
41}
42
43static void run_down(struct slot_map *m)
44{
45 DEFINE_WAIT(wait);
46 spin_lock(&m->q.lock);
47 if (m->c != -1) {
48 for (;;) {
49 if (likely(list_empty(&wait.task_list)))
50 __add_wait_queue_tail(&m->q, &wait);
51 set_current_state(TASK_UNINTERRUPTIBLE);
52
53 if (m->c == -1)
54 break;
55
56 spin_unlock(&m->q.lock);
57 schedule();
58 spin_lock(&m->q.lock);
59 }
60 __remove_wait_queue(&m->q, &wait);
61 __set_current_state(TASK_RUNNING);
62 }
63 m->map = NULL;
64 spin_unlock(&m->q.lock);
65}
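/*
 * Note on the counter convention used above: while a map is installed,
 * c is the number of free slots; c == -1 means no map is installed.
 * mark_killed() biases c by -(count + 1), so each put() below
 * increments it back and c reaches -1 again exactly when the last
 * outstanding slot is returned -- the condition run_down() waits for.
 */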
66
67static void put(struct slot_map *m, int slot)
68{
69 int v;
70 spin_lock(&m->q.lock);
71 __clear_bit(slot, m->map);
72 v = ++m->c;
73 if (unlikely(v == 1)) /* no free slots -> one free slot */
74 wake_up_locked(&m->q);
75 else if (unlikely(v == -1)) /* finished dying */
76 wake_up_all_locked(&m->q);
77 spin_unlock(&m->q.lock);
78}
79
80static int wait_for_free(struct slot_map *m)
81{
82 long left = slot_timeout_secs * HZ;
83 DEFINE_WAIT(wait);
84
85 do {
86 long n = left, t;
87 if (likely(list_empty(&wait.task_list)))
88 __add_wait_queue_tail_exclusive(&m->q, &wait);
89 set_current_state(TASK_INTERRUPTIBLE);
90
91 if (m->c > 0)
92 break;
93
94 if (m->c < 0) {
95 /* we are waiting for the map to be installed; */
96 /* it had better be there soon, or we give up */
97 if (n > ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ)
98 n = ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ;
99 }
100 spin_unlock(&m->q.lock);
101 t = schedule_timeout(n);
102 spin_lock(&m->q.lock);
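			/*
			 * If the shortened wait for a map install
			 * (n != left) timed out and there is still no
			 * map, give up; otherwise charge only the time
			 * actually slept against the overall budget.
			 */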
103 if (unlikely(!t) && n != left && m->c < 0)
104 left = t;
105 else
106 left = t + (left - n);
107 if (unlikely(signal_pending(current)))
108 left = -EINTR;
109 } while (left > 0);
110
111 if (!list_empty(&wait.task_list))
112 list_del(&wait.task_list);
113 else if (left <= 0 && waitqueue_active(&m->q))
114 __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
115 __set_current_state(TASK_RUNNING);
116
117 if (likely(left > 0))
118 return 0;
119
120 return left < 0 ? -EINTR : -ETIMEDOUT;
121}
122
123static int get(struct slot_map *m)
124{
125 int res = 0;
126 spin_lock(&m->q.lock);
127 if (unlikely(m->c <= 0))
128 res = wait_for_free(m);
129 if (likely(!res)) {
130 m->c--;
131 res = find_first_zero_bit(m->map, m->count);
132 __set_bit(res, m->map);
133 }
134 spin_unlock(&m->q.lock);
135 return res;
136}
137
138/* used to describe mapped buffers */
139struct orangefs_bufmap_desc {
140 void *uaddr; /* user space address pointer */
141 struct page **page_array; /* array of mapped pages */
142 int array_count; /* size of above arrays */
143 struct list_head list_link;
144};
145
146static struct orangefs_bufmap {
147 int desc_size;
148 int desc_shift;
149 int desc_count;
150 int total_size;
151 int page_count;
152
153 struct page **page_array;
154 struct orangefs_bufmap_desc *desc_array;
155
156 /* array to track usage of buffer descriptors */
157 unsigned long *buffer_index_array;
158
159 /* array to track usage of buffer descriptors for readdir */
160#define N DIV_ROUND_UP(ORANGEFS_READDIR_DEFAULT_DESC_COUNT, BITS_PER_LONG)
161 unsigned long readdir_index_array[N];
162#undef N
163} *__orangefs_bufmap;
164
165static DEFINE_SPINLOCK(orangefs_bufmap_lock);
166
167static void
168orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
169{
170 int i;
171
172 for (i = 0; i < bufmap->page_count; i++)
173 page_cache_release(bufmap->page_array[i]);
174}
175
176static void
177orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
178{
179 kfree(bufmap->page_array);
180 kfree(bufmap->desc_array);
181 kfree(bufmap->buffer_index_array);
182 kfree(bufmap);
183}
184
185/*
186 * XXX: Can the size and shift change while the caller gives up the
187 * XXX: lock between calling this and doing something useful?
188 */
189
190int orangefs_bufmap_size_query(void)
191{
192 struct orangefs_bufmap *bufmap;
193 int size = 0;
194 spin_lock(&orangefs_bufmap_lock);
195 bufmap = __orangefs_bufmap;
196 if (bufmap)
197 size = bufmap->desc_size;
198 spin_unlock(&orangefs_bufmap_lock);
199 return size;
200}
201
202int orangefs_bufmap_shift_query(void)
203{
204 struct orangefs_bufmap *bufmap;
205 int shift = 0;
206 spin_lock(&orangefs_bufmap_lock);
207 bufmap = __orangefs_bufmap;
208 if (bufmap)
209 shift = bufmap->desc_shift;
210 spin_unlock(&orangefs_bufmap_lock);
211 return shift;
212}
213
214static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
215static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
216
217/*
218 * orangefs_get_bufmap_init
219 *
220 * If the bufmap has been installed, the shared memory system,
221 * including the buffer_index_array, is available. Otherwise, it is not.
222 *
223 * returns 1 if the bufmap is installed, 0 otherwise
224 */
225int orangefs_get_bufmap_init(void)
226{
227 return __orangefs_bufmap ? 1 : 0;
228}
229
230
231static struct orangefs_bufmap *
232orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
233{
234 struct orangefs_bufmap *bufmap;
235
236 bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
237 if (!bufmap)
238 goto out;
239
240 bufmap->total_size = user_desc->total_size;
241 bufmap->desc_count = user_desc->count;
242 bufmap->desc_size = user_desc->size;
243 bufmap->desc_shift = ilog2(bufmap->desc_size);
244
245 bufmap->buffer_index_array =
246 kcalloc(BITS_TO_LONGS(bufmap->desc_count), sizeof(long), GFP_KERNEL);
247 if (!bufmap->buffer_index_array) {
248 gossip_err("orangefs: could not allocate %d buffer indices\n",
249 bufmap->desc_count);
250 goto out_free_bufmap;
251 }
252
253 bufmap->desc_array =
254 kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc),
255 GFP_KERNEL);
256 if (!bufmap->desc_array) {
257 gossip_err("orangefs: could not allocate %d descriptors\n",
258 bufmap->desc_count);
259 goto out_free_index_array;
260 }
261
262 bufmap->page_count = bufmap->total_size / PAGE_SIZE;
263
264 /* allocate storage to track our page mappings */
265 bufmap->page_array =
266 kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
267 if (!bufmap->page_array)
268 goto out_free_desc_array;
269
270 return bufmap;
271
272out_free_desc_array:
273 kfree(bufmap->desc_array);
274out_free_index_array:
275 kfree(bufmap->buffer_index_array);
276out_free_bufmap:
277 kfree(bufmap);
278out:
279 return NULL;
280}
281
282static int
283orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
284 struct ORANGEFS_dev_map_desc *user_desc)
285{
286 int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
287 int offset = 0, ret, i;
288
289 /* map the pages */
290 ret = get_user_pages_fast((unsigned long)user_desc->ptr,
291 bufmap->page_count, 1, bufmap->page_array);
292
293 if (ret < 0)
294 return ret;
295
296 if (ret != bufmap->page_count) {
297 gossip_err("orangefs error: asked for %d pages, only got %d.\n",
298 bufmap->page_count, ret);
299
300 for (i = 0; i < ret; i++) {
301 SetPageError(bufmap->page_array[i]);
302 page_cache_release(bufmap->page_array[i]);
303 }
304 return -ENOMEM;
305 }
306
307 /*
308 * ideally we want to get kernel space pointers for each page, but
309 * we can't kmap that many pages at once if highmem is being used.
310 * so instead, we just kmap/kunmap the page address each time the
311 * kaddr is needed.
312 */
313 for (i = 0; i < bufmap->page_count; i++)
314 flush_dcache_page(bufmap->page_array[i]);
315
316 /* build a list of available descriptors */
317 for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
318 bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
319 bufmap->desc_array[i].array_count = pages_per_desc;
320 bufmap->desc_array[i].uaddr =
321 (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
322 offset += pages_per_desc;
323 }
324
325 return 0;
326}
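/*
 * Worked example of the slicing above (illustrative only): with 4 KB
 * pages, desc_size = 64 KB and desc_count = 16, pages_per_desc is 16,
 * descriptor i covers page_array[16 * i .. 16 * i + 15], and its uaddr
 * is user_desc->ptr + i * 64 KB.
 */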
327
328/*
329 * orangefs_bufmap_initialize()
330 *
331 * initializes the mapped buffer interface
332 *
333 * returns 0 on success, -errno on failure
334 */
335int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc)
336{
337 struct orangefs_bufmap *bufmap;
338 int ret = -EINVAL;
339
340 gossip_debug(GOSSIP_BUFMAP_DEBUG,
341 "orangefs_bufmap_initialize: called (ptr ("
342 "%p) sz (%d) cnt(%d).\n",
343 user_desc->ptr,
344 user_desc->size,
345 user_desc->count);
346
347 /*
348 * sanity check alignment and size of buffer that caller wants to
349 * work with
350 */
351 if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
352 (unsigned long)user_desc->ptr) {
353 gossip_err("orangefs error: memory alignment (front). %p\n",
354 user_desc->ptr);
355 goto out;
356 }
357
358 if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
359 != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
360 gossip_err("orangefs error: memory alignment (back).(%p + %d)\n",
361 user_desc->ptr,
362 user_desc->total_size);
363 goto out;
364 }
365
366 if (user_desc->total_size != (user_desc->size * user_desc->count)) {
367 gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n",
368 user_desc->total_size,
369 user_desc->size,
370 user_desc->count);
371 goto out;
372 }
373
374 if ((user_desc->size % PAGE_SIZE) != 0) {
375 gossip_err("orangefs error: bufmap size not page size divisible (%d).\n",
376 user_desc->size);
377 goto out;
378 }
379
380 ret = -ENOMEM;
381 bufmap = orangefs_bufmap_alloc(user_desc);
382 if (!bufmap)
383 goto out;
384
385 ret = orangefs_bufmap_map(bufmap, user_desc);
386 if (ret)
387 goto out_free_bufmap;
388
389
390 spin_lock(&orangefs_bufmap_lock);
391 if (__orangefs_bufmap) {
392 spin_unlock(&orangefs_bufmap_lock);
393 gossip_err("orangefs: error: bufmap already initialized.\n");
394 ret = -EINVAL;
395 goto out_unmap_bufmap;
396 }
397 __orangefs_bufmap = bufmap;
398 install(&rw_map,
399 bufmap->desc_count,
400 bufmap->buffer_index_array);
401 install(&readdir_map,
402 ORANGEFS_READDIR_DEFAULT_DESC_COUNT,
403 bufmap->readdir_index_array);
404 spin_unlock(&orangefs_bufmap_lock);
405
406 gossip_debug(GOSSIP_BUFMAP_DEBUG,
407 "orangefs_bufmap_initialize: exiting normally\n");
408 return 0;
409
410out_unmap_bufmap:
411 orangefs_bufmap_unmap(bufmap);
412out_free_bufmap:
413 orangefs_bufmap_free(bufmap);
414out:
415 return ret;
416}
417
418/*
419 * orangefs_bufmap_finalize()
420 *
421 * shuts down the mapped buffer interface and releases any resources
422 * associated with it
423 *
424 * no return value
425 */
426void orangefs_bufmap_finalize(void)
427{
428 struct orangefs_bufmap *bufmap = __orangefs_bufmap;
429 if (!bufmap)
430 return;
431 gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n");
432 mark_killed(&rw_map);
433 mark_killed(&readdir_map);
434 gossip_debug(GOSSIP_BUFMAP_DEBUG,
435 "orangefs_bufmap_finalize: exiting normally\n");
436}
437
438void orangefs_bufmap_run_down(void)
439{
440 struct orangefs_bufmap *bufmap = __orangefs_bufmap;
441 if (!bufmap)
442 return;
443 run_down(&rw_map);
444 run_down(&readdir_map);
445 spin_lock(&orangefs_bufmap_lock);
446 __orangefs_bufmap = NULL;
447 spin_unlock(&orangefs_bufmap_lock);
448 orangefs_bufmap_unmap(bufmap);
449 orangefs_bufmap_free(bufmap);
450}
451
452/*
453 * orangefs_bufmap_get()
454 *
455 * gets a free mapped buffer descriptor, will sleep until one becomes
456 * available if necessary
457 *
458 * returns slot on success, -errno on failure
459 */
460int orangefs_bufmap_get(void)
461{
462 return get(&rw_map);
463}
464
465/*
466 * orangefs_bufmap_put()
467 *
468 * returns a mapped buffer descriptor to the collection
469 *
470 * no return value
471 */
472void orangefs_bufmap_put(int buffer_index)
473{
474 put(&rw_map, buffer_index);
475}
476
477/*
478 * orangefs_readdir_index_get()
479 *
480 * gets a free descriptor, will sleep until one becomes
481 * available if necessary.
482 * Although the readdir buffers are not mapped into kernel space
483 * we could do that at a later point of time. Regardless, these
484 * indices are used by the client-core.
485 *
486 * returns slot on success, -errno on failure
487 */
488int orangefs_readdir_index_get(void)
489{
490 return get(&readdir_map);
491}
492
493void orangefs_readdir_index_put(int buffer_index)
494{
495 put(&readdir_map, buffer_index);
496}
497
498/*
499 * we've been handed an iovec, we need to copy it to
500 * the shared memory descriptor at "buffer_index".
501 */
502int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
503 int buffer_index,
504 size_t size)
505{
506 struct orangefs_bufmap_desc *to;
507 int i;
508
509 gossip_debug(GOSSIP_BUFMAP_DEBUG,
510 "%s: buffer_index:%d: size:%zu:\n",
511 __func__, buffer_index, size);
512
513 to = &__orangefs_bufmap->desc_array[buffer_index];
514 for (i = 0; size; i++) {
515 struct page *page = to->page_array[i];
516 size_t n = size;
517 if (n > PAGE_SIZE)
518 n = PAGE_SIZE;
519 n = copy_page_from_iter(page, 0, n, iter);
520 if (!n)
521 return -EFAULT;
522 size -= n;
523 }
524 return 0;
525
526}
527
528/*
529 * we've been handed an iovec, we need to fill it from
530 * the shared memory descriptor at "buffer_index".
531 */
532int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
533 int buffer_index,
534 size_t size)
535{
536 struct orangefs_bufmap_desc *from;
537 int i;
538
539 from = &__orangefs_bufmap->desc_array[buffer_index];
540 gossip_debug(GOSSIP_BUFMAP_DEBUG,
541 "%s: buffer_index:%d: size:%zu:\n",
542 __func__, buffer_index, size);
543
544
545 for (i = 0; size; i++) {
546 struct page *page = from->page_array[i];
547 size_t n = size;
548 if (n > PAGE_SIZE)
549 n = PAGE_SIZE;
550 n = copy_page_to_iter(page, 0, n, iter);
551 if (!n)
552 return -EFAULT;
553 size -= n;
554 }
555 return 0;
556}
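/*
 * Illustrative sketch, not part of this patch: how a write path might
 * combine the helpers above. The upcall to the client-core is elided
 * and error handling is minimal; example_stage_write() is a
 * hypothetical name.
 */
static int example_stage_write(struct iov_iter *iter, size_t size)
{
	int ret;
	int slot = orangefs_bufmap_get();	/* may sleep for a free slot */

	if (slot < 0)
		return slot;			/* -EINTR or -ETIMEDOUT */
	ret = orangefs_bufmap_copy_from_iovec(iter, slot, size);
	/* ... issue the ORANGEFS_VFS_OP_FILE_IO upcall using "slot" ... */
	orangefs_bufmap_put(slot);
	return ret;
}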
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
new file mode 100644
index 000000000000..71f64f4057b5
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -0,0 +1,36 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#ifndef __ORANGEFS_BUFMAP_H
8#define __ORANGEFS_BUFMAP_H
9
10int orangefs_bufmap_size_query(void);
11
12int orangefs_bufmap_shift_query(void);
13
14int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
15
16void orangefs_bufmap_finalize(void);
17
18void orangefs_bufmap_run_down(void);
19
20int orangefs_bufmap_get(void);
21
22void orangefs_bufmap_put(int buffer_index);
23
24int orangefs_readdir_index_get(void);
25
26void orangefs_readdir_index_put(int buffer_index);
27
28int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
29 int buffer_index,
30 size_t size);
31
32int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
33 int buffer_index,
34 size_t size);
35
36#endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
new file mode 100644
index 000000000000..900a2e38e11b
--- /dev/null
+++ b/fs/orangefs/orangefs-cache.c
@@ -0,0 +1,161 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9
10/* tags assigned to kernel upcall operations */
11static __u64 next_tag_value;
12static DEFINE_SPINLOCK(next_tag_value_lock);
13
14/* the orangefs memory caches */
15
16/* a cache for orangefs upcall/downcall operations */
17static struct kmem_cache *op_cache;
18
19int op_cache_initialize(void)
20{
21 op_cache = kmem_cache_create("orangefs_op_cache",
22 sizeof(struct orangefs_kernel_op_s),
23 0,
24 ORANGEFS_CACHE_CREATE_FLAGS,
25 NULL);
26
27 if (!op_cache) {
28 gossip_err("Cannot create orangefs_op_cache\n");
29 return -ENOMEM;
30 }
31
32 /* initialize our atomic tag counter */
33 spin_lock(&next_tag_value_lock);
34 next_tag_value = 100;
35 spin_unlock(&next_tag_value_lock);
36 return 0;
37}
38
39int op_cache_finalize(void)
40{
41 kmem_cache_destroy(op_cache);
42 return 0;
43}
44
45char *get_opname_string(struct orangefs_kernel_op_s *new_op)
46{
47 if (new_op) {
48 __s32 type = new_op->upcall.type;
49
50 if (type == ORANGEFS_VFS_OP_FILE_IO)
51 return "OP_FILE_IO";
52 else if (type == ORANGEFS_VFS_OP_LOOKUP)
53 return "OP_LOOKUP";
54 else if (type == ORANGEFS_VFS_OP_CREATE)
55 return "OP_CREATE";
56 else if (type == ORANGEFS_VFS_OP_GETATTR)
57 return "OP_GETATTR";
58 else if (type == ORANGEFS_VFS_OP_REMOVE)
59 return "OP_REMOVE";
60 else if (type == ORANGEFS_VFS_OP_MKDIR)
61 return "OP_MKDIR";
62 else if (type == ORANGEFS_VFS_OP_READDIR)
63 return "OP_READDIR";
64 else if (type == ORANGEFS_VFS_OP_READDIRPLUS)
65 return "OP_READDIRPLUS";
66 else if (type == ORANGEFS_VFS_OP_SETATTR)
67 return "OP_SETATTR";
68 else if (type == ORANGEFS_VFS_OP_SYMLINK)
69 return "OP_SYMLINK";
70 else if (type == ORANGEFS_VFS_OP_RENAME)
71 return "OP_RENAME";
72 else if (type == ORANGEFS_VFS_OP_STATFS)
73 return "OP_STATFS";
74 else if (type == ORANGEFS_VFS_OP_TRUNCATE)
75 return "OP_TRUNCATE";
76 else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
77 return "OP_MMAP_RA_FLUSH";
78 else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
79 return "OP_FS_MOUNT";
80 else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
81 return "OP_FS_UMOUNT";
82 else if (type == ORANGEFS_VFS_OP_GETXATTR)
83 return "OP_GETXATTR";
84 else if (type == ORANGEFS_VFS_OP_SETXATTR)
85 return "OP_SETXATTR";
86 else if (type == ORANGEFS_VFS_OP_LISTXATTR)
87 return "OP_LISTXATTR";
88 else if (type == ORANGEFS_VFS_OP_REMOVEXATTR)
89 return "OP_REMOVEXATTR";
90 else if (type == ORANGEFS_VFS_OP_PARAM)
91 return "OP_PARAM";
92 else if (type == ORANGEFS_VFS_OP_PERF_COUNT)
93 return "OP_PERF_COUNT";
94 else if (type == ORANGEFS_VFS_OP_CANCEL)
95 return "OP_CANCEL";
96 else if (type == ORANGEFS_VFS_OP_FSYNC)
97 return "OP_FSYNC";
98 else if (type == ORANGEFS_VFS_OP_FSKEY)
99 return "OP_FSKEY";
100 }
101 return "OP_UNKNOWN?";
102}
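/*
 * Illustrative alternative, not part of this patch: the else-if chain
 * above could be table-driven. opname_table/opname_lookup are
 * hypothetical names; only a few entries are shown.
 */
static const struct {
	__s32 type;
	const char *name;
} opname_table[] = {
	{ ORANGEFS_VFS_OP_FILE_IO, "OP_FILE_IO" },
	{ ORANGEFS_VFS_OP_LOOKUP, "OP_LOOKUP" },
	{ ORANGEFS_VFS_OP_CREATE, "OP_CREATE" },
	/* ... remaining ops elided ... */
};

static const char *opname_lookup(__s32 type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(opname_table); i++)
		if (opname_table[i].type == type)
			return opname_table[i].name;
	return "OP_UNKNOWN?";
}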
103
104void orangefs_new_tag(struct orangefs_kernel_op_s *op)
105{
106 spin_lock(&next_tag_value_lock);
107 op->tag = next_tag_value++;
108 if (next_tag_value == 0)
109 next_tag_value = 100;
110 spin_unlock(&next_tag_value_lock);
111}
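/*
 * Note: tags start at 100 (see op_cache_initialize()) and wrap back to
 * 100 on __u64 overflow, so values 0..99 are never handed out.
 */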
112
113struct orangefs_kernel_op_s *op_alloc(__s32 type)
114{
115 struct orangefs_kernel_op_s *new_op = NULL;
116
117 new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL);
118 if (new_op) {
119 INIT_LIST_HEAD(&new_op->list);
120 spin_lock_init(&new_op->lock);
121 init_completion(&new_op->waitq);
122
123 new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
124 new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
125 new_op->downcall.status = -1;
126
127 new_op->op_state = OP_VFS_STATE_UNKNOWN;
128
129 /* initialize the op specific tag and upcall credentials */
130 orangefs_new_tag(new_op);
131 new_op->upcall.type = type;
132 new_op->attempts = 0;
133 gossip_debug(GOSSIP_CACHE_DEBUG,
134 "Alloced OP (%p: %llu %s)\n",
135 new_op,
136 llu(new_op->tag),
137 get_opname_string(new_op));
138
139 new_op->upcall.uid = from_kuid(current_user_ns(),
140 current_fsuid());
141
142 new_op->upcall.gid = from_kgid(current_user_ns(),
143 current_fsgid());
144 } else {
145 gossip_err("op_alloc: kmem_cache_zalloc failed!\n");
146 }
147 return new_op;
148}
149
150void op_release(struct orangefs_kernel_op_s *orangefs_op)
151{
152 if (orangefs_op) {
153 gossip_debug(GOSSIP_CACHE_DEBUG,
154 "Releasing OP (%p: %llu)\n",
155 orangefs_op,
156 llu(orangefs_op->tag));
157 kmem_cache_free(op_cache, orangefs_op);
158 } else {
159 gossip_err("NULL pointer in op_release\n");
160 }
161}
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
new file mode 100644
index 000000000000..387db17cde2b
--- /dev/null
+++ b/fs/orangefs/orangefs-debug.h
@@ -0,0 +1,92 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/* This file just defines debugging masks to be used with the gossip
8 * logging utility. All debugging masks for ORANGEFS are kept here to make
9 * sure we don't have collisions.
10 */
11
12#ifndef __ORANGEFS_DEBUG_H
13#define __ORANGEFS_DEBUG_H
14
15#ifdef __KERNEL__
16#include <linux/types.h>
17#else
18#include <stdint.h>
19#endif
20
21#define GOSSIP_NO_DEBUG (__u64)0
22
23#define GOSSIP_SUPER_DEBUG ((__u64)1 << 0)
24#define GOSSIP_INODE_DEBUG ((__u64)1 << 1)
25#define GOSSIP_FILE_DEBUG ((__u64)1 << 2)
26#define GOSSIP_DIR_DEBUG ((__u64)1 << 3)
27#define GOSSIP_UTILS_DEBUG ((__u64)1 << 4)
28#define GOSSIP_WAIT_DEBUG ((__u64)1 << 5)
29#define GOSSIP_ACL_DEBUG ((__u64)1 << 6)
30#define GOSSIP_DCACHE_DEBUG ((__u64)1 << 7)
31#define GOSSIP_DEV_DEBUG ((__u64)1 << 8)
32#define GOSSIP_NAME_DEBUG ((__u64)1 << 9)
33#define GOSSIP_BUFMAP_DEBUG ((__u64)1 << 10)
34#define GOSSIP_CACHE_DEBUG ((__u64)1 << 11)
35#define GOSSIP_DEBUGFS_DEBUG ((__u64)1 << 12)
36#define GOSSIP_XATTR_DEBUG ((__u64)1 << 13)
37#define GOSSIP_INIT_DEBUG ((__u64)1 << 14)
38#define GOSSIP_SYSFS_DEBUG ((__u64)1 << 15)
39
40#define GOSSIP_MAX_NR 16
41#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
42
43/* function prototypes */
44__u64 ORANGEFS_kmod_eventlog_to_mask(const char *event_logging);
45__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging);
46char *ORANGEFS_debug_mask_to_eventlog(__u64 mask);
47char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask);
48
49/* a private internal type */
50struct __keyword_mask_s {
51 const char *keyword;
52 __u64 mask_val;
53};
54
55/*
56 * Map all kmod keywords to kmod debug masks here. Keep this
57 * structure "packed":
58 *
59 * "all" is always last...
60 *
61 * keyword mask_val index
62 * foo 1 0
63 * bar 2 1
64 * baz 4 2
65 * qux 8 3
66 * . . .
67 */
68static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
69 {"super", GOSSIP_SUPER_DEBUG},
70 {"inode", GOSSIP_INODE_DEBUG},
71 {"file", GOSSIP_FILE_DEBUG},
72 {"dir", GOSSIP_DIR_DEBUG},
73 {"utils", GOSSIP_UTILS_DEBUG},
74 {"wait", GOSSIP_WAIT_DEBUG},
75 {"acl", GOSSIP_ACL_DEBUG},
76 {"dcache", GOSSIP_DCACHE_DEBUG},
77 {"dev", GOSSIP_DEV_DEBUG},
78 {"name", GOSSIP_NAME_DEBUG},
79 {"bufmap", GOSSIP_BUFMAP_DEBUG},
80 {"cache", GOSSIP_CACHE_DEBUG},
81 {"debugfs", GOSSIP_DEBUGFS_DEBUG},
82 {"xattr", GOSSIP_XATTR_DEBUG},
83 {"init", GOSSIP_INIT_DEBUG},
84 {"sysfs", GOSSIP_SYSFS_DEBUG},
85 {"none", GOSSIP_NO_DEBUG},
86 {"all", GOSSIP_MAX_DEBUG}
87};
88
89static const int num_kmod_keyword_mask_map = (int)
90 (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s));
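/*
 * Illustrative example, not part of this patch: the helpers declared
 * above map keyword lists to masks, e.g.
 *
 *	__u64 mask = ORANGEFS_kmod_eventlog_to_mask("file,inode");
 *
 * should yield (GOSSIP_FILE_DEBUG | GOSSIP_INODE_DEBUG), assuming the
 * keywords come from s_kmod_keyword_mask_map.
 */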
91
92#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
new file mode 100644
index 000000000000..19670b8b4053
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -0,0 +1,455 @@
1/*
2 * What: /sys/kernel/debug/orangefs/debug-help
3 * Date: June 2015
4 * Contact: Mike Marshall <hubcap@omnibond.com>
5 * Description:
6 * List of client and kernel debug keywords.
7 *
8 *
9 * What: /sys/kernel/debug/orangefs/client-debug
10 * Date: June 2015
11 * Contact: Mike Marshall <hubcap@omnibond.com>
12 * Description:
13 * Debug setting for "the client", the userspace
14 * helper for the kernel module.
15 *
16 *
17 * What: /sys/kernel/debug/orangefs/kernel-debug
18 * Date: June 2015
19 * Contact: Mike Marshall <hubcap@omnibond.com>
20 * Description:
21 * Debug setting for the orangefs kernel module.
22 *
23 * Any of the keywords, or comma-separated lists
24 * of keywords, from debug-help can be catted to
25 * client-debug or kernel-debug.
26 *
27 * "none", "all" and "verbose" are special keywords
28 * for client-debug. Setting client-debug to "all"
29 * is kind of like trying to drink water from a
30 * fire hose, "verbose" triggers most of the same
31 * output except for the constant flow of output
32 * from the main wait loop.
33 *
34 * "none" and "all" are similar settings for kernel-debug
35 * no need for a "verbose".
36 */
37#include <linux/debugfs.h>
38#include <linux/slab.h>
39
40#include <linux/uaccess.h>
41
42#include "orangefs-debugfs.h"
43#include "protocol.h"
44#include "orangefs-kernel.h"
45
46static int orangefs_debug_disabled = 1;
47
48static int orangefs_debug_help_open(struct inode *, struct file *);
49
50const struct file_operations debug_help_fops = {
51 .open = orangefs_debug_help_open,
52 .read = seq_read,
53 .release = seq_release,
54 .llseek = seq_lseek,
55};
56
57static void *help_start(struct seq_file *, loff_t *);
58static void *help_next(struct seq_file *, void *, loff_t *);
59static void help_stop(struct seq_file *, void *);
60static int help_show(struct seq_file *, void *);
61
62static const struct seq_operations help_debug_ops = {
63 .start = help_start,
64 .next = help_next,
65 .stop = help_stop,
66 .show = help_show,
67};
68
69/*
70 * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
71 * ORANGEFS_CLIENT_DEBUG_FILE.
72 */
73static DEFINE_MUTEX(orangefs_debug_lock);
74
75int orangefs_debug_open(struct inode *, struct file *);
76
77static ssize_t orangefs_debug_read(struct file *,
78 char __user *,
79 size_t,
80 loff_t *);
81
82static ssize_t orangefs_debug_write(struct file *,
83 const char __user *,
84 size_t,
85 loff_t *);
86
87static const struct file_operations kernel_debug_fops = {
88 .open = orangefs_debug_open,
89 .read = orangefs_debug_read,
90 .write = orangefs_debug_write,
91 .llseek = generic_file_llseek,
92};
93
94/*
95 * initialize kmod debug operations, create orangefs debugfs dir and
96 * ORANGEFS_KMOD_DEBUG_HELP_FILE.
97 */
98int orangefs_debugfs_init(void)
99{
100
101 int rc = -ENOMEM;
102
103 debug_dir = debugfs_create_dir("orangefs", NULL);
104 if (!debug_dir) {
105 pr_info("%s: debugfs_create_dir failed.\n", __func__);
106 goto out;
107 }
108
109 help_file_dentry = debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE,
110 0444,
111 debug_dir,
112 debug_help_string,
113 &debug_help_fops);
114 if (!help_file_dentry) {
115 pr_info("%s: debugfs_create_file failed.\n", __func__);
116 goto out;
117 }
118
119 orangefs_debug_disabled = 0;
120 rc = 0;
121
122out:
123
124 return rc;
125}
126
127void orangefs_debugfs_cleanup(void)
128{
129 if (debug_dir)
130 debugfs_remove_recursive(debug_dir);
131}
132
133/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
134static int orangefs_debug_help_open(struct inode *inode, struct file *file)
135{
136 int rc = -ENODEV;
137 int ret;
138
139 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
140 "orangefs_debug_help_open: start\n");
141
142 if (orangefs_debug_disabled)
143 goto out;
144
145 ret = seq_open(file, &help_debug_ops);
146 if (ret)
147 goto out;
148
149 ((struct seq_file *)(file->private_data))->private = inode->i_private;
150
151 rc = 0;
152
153out:
154 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
155 "orangefs_debug_help_open: rc:%d:\n",
156 rc);
157 return rc;
158}
159
160/*
161 * I think start always gets called again after stop. Start
162 * needs to return NULL when it is done. The whole "payload"
163 * in this case is a single (long) string, so by the second
164 * time we get to start (pos = 1), we're done.
165 */
166static void *help_start(struct seq_file *m, loff_t *pos)
167{
168 void *payload = NULL;
169
170 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n");
171
172 if (*pos == 0)
173 payload = m->private;
174
175 return payload;
176}
177
178static void *help_next(struct seq_file *m, void *v, loff_t *pos)
179{
180 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
181
182 return NULL;
183}
184
185static void help_stop(struct seq_file *m, void *p)
186{
187 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n");
188}
189
190static int help_show(struct seq_file *m, void *v)
191{
192 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_show: start\n");
193
194 seq_puts(m, v);
195
196 return 0;
197}
198
199/*
200 * initialize the kernel-debug file.
201 */
202int orangefs_kernel_debug_init(void)
203{
204 int rc = -ENOMEM;
205 struct dentry *ret;
206 char *k_buffer = NULL;
207
208 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
209
210 k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
211 if (!k_buffer)
212 goto out;
213
214 if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
215 strcpy(k_buffer, kernel_debug_string);
216 strcat(k_buffer, "\n");
217 } else {
218 strcpy(k_buffer, "none\n");
219 pr_info("%s: overflow 1!\n", __func__);
220 }
221
222 ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
223 0444,
224 debug_dir,
225 k_buffer,
226 &kernel_debug_fops);
227 if (!ret) {
228 pr_info("%s: failed to create %s.\n",
229 __func__,
230 ORANGEFS_KMOD_DEBUG_FILE);
231 goto out;
232 }
233
234 rc = 0;
235
236out:
237
238 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
239 return rc;
240}
241
242/*
243 * initialize the client-debug file.
244 */
245int orangefs_client_debug_init(void)
246{
247
248 int rc = -ENOMEM;
249 char *c_buffer = NULL;
250
251 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
252
253 c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
254 if (!c_buffer)
255 goto out;
256
257 if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
258 strcpy(c_buffer, client_debug_string);
259 strcat(c_buffer, "\n");
260 } else {
261 strcpy(c_buffer, "none\n");
262 pr_info("%s: overflow! 2\n", __func__);
263 }
264
265 client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE,
266 0444,
267 debug_dir,
268 c_buffer,
269 &kernel_debug_fops);
270 if (!client_debug_dentry) {
271 pr_info("%s: failed to create updated %s.\n",
272 __func__,
273 ORANGEFS_CLIENT_DEBUG_FILE);
274 goto out;
275 }
276
277 rc = 0;
278
279out:
280
281 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
282 return rc;
283}
284
285/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
286int orangefs_debug_open(struct inode *inode, struct file *file)
287{
288 int rc = -ENODEV;
289
290 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
291 "%s: orangefs_debug_disabled: %d\n",
292 __func__,
293 orangefs_debug_disabled);
294
295 if (orangefs_debug_disabled)
296 goto out;
297
298 rc = 0;
299 mutex_lock(&orangefs_debug_lock);
300 file->private_data = inode->i_private;
301 mutex_unlock(&orangefs_debug_lock);
302
303out:
304 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
305 "orangefs_debug_open: rc: %d\n",
306 rc);
307 return rc;
308}
309
310static ssize_t orangefs_debug_read(struct file *file,
311 char __user *ubuf,
312 size_t count,
313 loff_t *ppos)
314{
315 char *buf;
316 int sprintf_ret;
317 ssize_t read_ret = -ENOMEM;
318
319 gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_read: start\n");
320
321 buf = kmalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
322 if (!buf)
323 goto out;
324
325 mutex_lock(&orangefs_debug_lock);
326 sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
327 mutex_unlock(&orangefs_debug_lock);
328
329 read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
330
331 kfree(buf);
332
333out:
334 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
335 "orangefs_debug_read: ret: %zu\n",
336 read_ret);
337
338 return read_ret;
339}
340
341static ssize_t orangefs_debug_write(struct file *file,
342 const char __user *ubuf,
343 size_t count,
344 loff_t *ppos)
345{
346 char *buf;
347 int rc = -EFAULT;
348 size_t silly = 0;
349 char *debug_string;
350 struct orangefs_kernel_op_s *new_op = NULL;
351 struct client_debug_mask c_mask = { NULL, 0, 0 };
352
353 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
354 "orangefs_debug_write: %s\n",
355 file->f_path.dentry->d_name.name);
356
357 /*
358 * Thwart users who try to jam a ridiculous number
359 * of bytes into the debug file...
360 */
361 if (count >= ORANGEFS_MAX_DEBUG_STRING_LEN) {
362 silly = count;
363 count = ORANGEFS_MAX_DEBUG_STRING_LEN; /* keep room for a NUL in buf */
364 }
365
366 buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
367 if (!buf)
368 goto out;
369
370 if (copy_from_user(buf, ubuf, count - 1)) {
371 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
372 "%s: copy_from_user failed!\n",
373 __func__);
374 goto out;
375 }
376
377 /*
378 * Map the keyword string from userspace into a valid debug mask.
379 * The mapping process involves mapping the human-inputted string
380 * into a valid mask, and then rebuilding the string from the
381 * verified valid mask.
382 *
383 * A service operation is required to set a new client-side
384 * debug mask.
385 */
386 if (!strcmp(file->f_path.dentry->d_name.name,
387 ORANGEFS_KMOD_DEBUG_FILE)) {
388 debug_string_to_mask(buf, &gossip_debug_mask, 0);
389 debug_mask_to_string(&gossip_debug_mask, 0);
390 debug_string = kernel_debug_string;
391 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
392 "New kernel debug string is %s\n",
393 kernel_debug_string);
394 } else {
395 /* Can't reset client debug mask if client is not running. */
396 if (is_daemon_in_service()) {
397 pr_info("%s: Client not running :%d:\n",
398 __func__,
399 is_daemon_in_service());
400 goto out;
401 }
402
403 debug_string_to_mask(buf, &c_mask, 1);
404 debug_mask_to_string(&c_mask, 1);
405 debug_string = client_debug_string;
406
407 new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
408 if (!new_op) {
409 pr_info("%s: op_alloc failed!\n", __func__);
410 goto out;
411 }
412
413 new_op->upcall.req.param.op =
414 ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES;
415 new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
416 memset(new_op->upcall.req.param.s_value,
417 0,
418 ORANGEFS_MAX_DEBUG_STRING_LEN);
419 sprintf(new_op->upcall.req.param.s_value,
420 "%llx %llx\n",
421 c_mask.mask1,
422 c_mask.mask2);
423
424 /* service_operation returns 0 on success... */
425 rc = service_operation(new_op,
426 "orangefs_param",
427 ORANGEFS_OP_INTERRUPTIBLE);
428
429 if (rc)
430 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
431 "%s: service_operation failed! rc:%d:\n",
432 __func__,
433 rc);
434
435 op_release(new_op);
436 }
437
438 mutex_lock(&orangefs_debug_lock);
439 memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
440 sprintf((char *)file->f_inode->i_private, "%s\n", debug_string);
441 mutex_unlock(&orangefs_debug_lock);
442
443 *ppos += count;
444 if (silly)
445 rc = silly;
446 else
447 rc = count;
448
449out:
450 gossip_debug(GOSSIP_DEBUGFS_DEBUG,
451 "orangefs_debug_write: rc: %d\n",
452 rc);
453 kfree(buf);
454 return rc;
455}
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
new file mode 100644
index 000000000000..e4828c0e3ef9
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -0,0 +1,3 @@
1int orangefs_debugfs_init(void);
2int orangefs_kernel_debug_init(void);
3void orangefs_debugfs_cleanup(void);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
new file mode 100644
index 000000000000..9eac9d9a3f3a
--- /dev/null
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -0,0 +1,62 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#ifndef _ORANGEFS_DEV_PROTO_H
8#define _ORANGEFS_DEV_PROTO_H
9
10/*
11 * types and constants shared between user space and kernel space for
12 * device interaction using a common protocol
13 */
14
15/*
16 * valid orangefs kernel operation types
17 */
18#define ORANGEFS_VFS_OP_INVALID 0xFF000000
19#define ORANGEFS_VFS_OP_FILE_IO 0xFF000001
20#define ORANGEFS_VFS_OP_LOOKUP 0xFF000002
21#define ORANGEFS_VFS_OP_CREATE 0xFF000003
22#define ORANGEFS_VFS_OP_GETATTR 0xFF000004
23#define ORANGEFS_VFS_OP_REMOVE 0xFF000005
24#define ORANGEFS_VFS_OP_MKDIR 0xFF000006
25#define ORANGEFS_VFS_OP_READDIR 0xFF000007
26#define ORANGEFS_VFS_OP_SETATTR 0xFF000008
27#define ORANGEFS_VFS_OP_SYMLINK 0xFF000009
28#define ORANGEFS_VFS_OP_RENAME 0xFF00000A
29#define ORANGEFS_VFS_OP_STATFS 0xFF00000B
30#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C
31#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D
32#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E
33#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F
34#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010
35#define ORANGEFS_VFS_OP_SETXATTR 0xFF000011
36#define ORANGEFS_VFS_OP_LISTXATTR 0xFF000012
37#define ORANGEFS_VFS_OP_REMOVEXATTR 0xFF000013
38#define ORANGEFS_VFS_OP_PARAM 0xFF000014
39#define ORANGEFS_VFS_OP_PERF_COUNT 0xFF000015
40#define ORANGEFS_VFS_OP_CANCEL 0xFF00EE00
41#define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01
42#define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02
43#define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03
44
45/*
46 * Misc constants. Please retain them as multiples of 8!
47 * Otherwise 32-64 bit interactions will be messed up :)
48 */
49#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000400
50#define ORANGEFS_MAX_DEBUG_ARRAY_LEN 0x00000800
51
52/*
53 * The maximum number of directory entries in a single request is 96.
54 * XXX: Why can this not be higher? The client-side code can handle up to 512.
55 * XXX: What happens if we expect more than the client can return?
56 */
57#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
58
59#include "upcall.h"
60#include "downcall.h"
61
62#endif
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
new file mode 100644
index 000000000000..a9925e296ceb
--- /dev/null
+++ b/fs/orangefs/orangefs-kernel.h
@@ -0,0 +1,623 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * The ORANGEFS Linux kernel support allows ORANGEFS volumes to be mounted and
9 * accessed through the Linux VFS (i.e. using standard I/O system calls).
10 * This support is only needed on clients that wish to mount the file system.
11 *
12 */
13
14/*
15 * Declarations and macros for the ORANGEFS Linux kernel support.
16 */
17
18#ifndef __ORANGEFSKERNEL_H
19#define __ORANGEFSKERNEL_H
20
21#include <linux/kernel.h>
22#include <linux/moduleparam.h>
23#include <linux/statfs.h>
24#include <linux/backing-dev.h>
25#include <linux/device.h>
26#include <linux/mpage.h>
27#include <linux/namei.h>
28#include <linux/errno.h>
29#include <linux/init.h>
30#include <linux/module.h>
31#include <linux/slab.h>
32#include <linux/types.h>
33#include <linux/fs.h>
34#include <linux/vmalloc.h>
35
36#include <linux/aio.h>
37#include <linux/posix_acl.h>
38#include <linux/posix_acl_xattr.h>
39#include <linux/compat.h>
40#include <linux/mount.h>
41#include <linux/uaccess.h>
42#include <linux/atomic.h>
43#include <linux/uio.h>
44#include <linux/sched.h>
45#include <linux/mm.h>
46#include <linux/wait.h>
47#include <linux/dcache.h>
48#include <linux/pagemap.h>
49#include <linux/poll.h>
50#include <linux/rwsem.h>
51#include <linux/xattr.h>
52#include <linux/exportfs.h>
53
54#include <asm/unaligned.h>
55
56#include "orangefs-dev-proto.h"
57
58#ifdef ORANGEFS_KERNEL_DEBUG
59#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 10
60#else
61#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 20
62#endif
63
64#define ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS 30
65
66#define ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS 900 /* 15 minutes */
67
68#define ORANGEFS_REQDEVICE_NAME "pvfs2-req"
69
70#define ORANGEFS_DEVREQ_MAGIC 0x20030529
71#define ORANGEFS_LINK_MAX 0x000000FF
72#define ORANGEFS_PURGE_RETRY_COUNT 0x00000005
73#define ORANGEFS_MAX_NUM_OPTIONS 0x00000004
74#define ORANGEFS_MAX_MOUNT_OPT_LEN 0x00000080
75#define ORANGEFS_MAX_FSKEY_LEN 64
76
77#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \
78sizeof(__u64) + sizeof(struct orangefs_upcall_s))
79#define MAX_DEV_REQ_DOWNSIZE (2 * sizeof(__s32) + \
80sizeof(__u64) + sizeof(struct orangefs_downcall_s))
81
82/*
83 * valid orangefs kernel operation states
84 *
85 * unknown - op was just initialized
86 * waiting - op is on request_list (upward bound)
87 * inprogr - op is in progress (waiting for downcall)
88 * serviced - op has matching downcall; ok
89 * purged - op has to start a timer since client-core
90 * exited uncleanly before servicing op
91 * given up - submitter has given up waiting for it
92 */
93enum orangefs_vfs_op_states {
94 OP_VFS_STATE_UNKNOWN = 0,
95 OP_VFS_STATE_WAITING = 1,
96 OP_VFS_STATE_INPROGR = 2,
97 OP_VFS_STATE_SERVICED = 4,
98 OP_VFS_STATE_PURGED = 8,
99 OP_VFS_STATE_GIVEN_UP = 16,
100};
101
102/*
103 * An array of client_debug_mask will be built to hold debug keyword/mask
104 * values fetched from userspace.
105 */
106struct client_debug_mask {
107 char *keyword;
108 __u64 mask1;
109 __u64 mask2;
110};
111
112/*
113 * orangefs kernel memory related flags
114 */
115
116#if ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB))
117#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
118#else
119#define ORANGEFS_CACHE_CREATE_FLAGS 0
120#endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
121
122/* orangefs xattr and acl related defines */
123#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1
124#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2
125#define ORANGEFS_XATTR_INDEX_TRUSTED 3
126#define ORANGEFS_XATTR_INDEX_DEFAULT 4
127
128#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS
129#define ORANGEFS_XATTR_NAME_ACL_DEFAULT XATTR_NAME_POSIX_ACL_DEFAULT
130#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted."
131#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX ""
132
133/* these functions are defined in orangefs-utils.c */
134int orangefs_prepare_cdm_array(char *debug_array_string);
135int orangefs_prepare_debugfs_help_string(int);
136
137/* defined in orangefs-debugfs.c */
138int orangefs_client_debug_init(void);
139
140void debug_string_to_mask(char *, void *, int);
141void do_c_mask(int, char *, struct client_debug_mask **);
142void do_k_mask(int, char *, __u64 **);
143
144void debug_mask_to_string(void *, int);
145void do_k_string(void *, int);
146void do_c_string(void *, int);
147int check_amalgam_keyword(void *, int);
148int keyword_is_amalgam(char *);
149
150/* these variables are defined in orangefs-mod.c */
151extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
152extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
153extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
154extern unsigned int kernel_mask_set_mod_init;
155
156extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
157extern const struct xattr_handler *orangefs_xattr_handlers[];
158
159extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
160extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
161
162/*
163 * Redefine xtvec structure so that we could move helper functions out of
164 * the define
165 */
166struct xtvec {
167 __kernel_off_t xtv_off; /* must be off_t */
168 __kernel_size_t xtv_len; /* must be size_t */
169};
170
171/*
172 * orangefs data structures
173 */
174struct orangefs_kernel_op_s {
175 enum orangefs_vfs_op_states op_state;
176 __u64 tag;
177
178 /*
179 * Set uses_shared_memory to non zero if this operation uses
180 * shared memory. If true, then a retry on the op must also
181 * get a new shared memory buffer and re-populate it.
182 * Cancels don't care - it only matters for service_operation()
184 * retry logic, and cancels don't go through it anymore. It
184 * safely stays non-zero when we use it as slot_to_free.
185 */
186 union {
187 int uses_shared_memory;
188 int slot_to_free;
189 };
190
191 struct orangefs_upcall_s upcall;
192 struct orangefs_downcall_s downcall;
193
194 struct completion waitq;
195 spinlock_t lock;
196
197 int attempts;
198
199 struct list_head list;
200};
201
202#define set_op_state_waiting(op) ((op)->op_state = OP_VFS_STATE_WAITING)
203#define set_op_state_inprogress(op) ((op)->op_state = OP_VFS_STATE_INPROGR)
204#define set_op_state_given_up(op) ((op)->op_state = OP_VFS_STATE_GIVEN_UP)
205static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
206{
207 op->op_state = OP_VFS_STATE_SERVICED;
208 complete(&op->waitq);
209}
210
211#define op_state_waiting(op) ((op)->op_state & OP_VFS_STATE_WAITING)
212#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR)
213#define op_state_serviced(op) ((op)->op_state & OP_VFS_STATE_SERVICED)
214#define op_state_purged(op) ((op)->op_state & OP_VFS_STATE_PURGED)
215#define op_state_given_up(op) ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
216#define op_is_cancel(op) ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)
217
218void op_release(struct orangefs_kernel_op_s *op);
219
220extern void orangefs_bufmap_put(int);
221static inline void put_cancel(struct orangefs_kernel_op_s *op)
222{
223 orangefs_bufmap_put(op->slot_to_free);
224 op_release(op);
225}
226
227static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
228{
229 spin_lock(&op->lock);
230 if (unlikely(op_is_cancel(op))) {
231 list_del_init(&op->list);
232 spin_unlock(&op->lock);
233 put_cancel(op);
234 } else {
235 op->op_state |= OP_VFS_STATE_PURGED;
236 complete(&op->waitq);
237 spin_unlock(&op->lock);
238 }
239}
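/*
 * Note: the op_state_*() macros test with "&" rather than "=="
 * because OP_VFS_STATE_PURGED is OR-ed into whatever state the op
 * already had, so e.g. a waiting op that gets purged still satisfies
 * op_state_waiting() as well as op_state_purged().
 */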
240
241/* per inode private orangefs info */
242struct orangefs_inode_s {
243 struct orangefs_object_kref refn;
244 char link_target[ORANGEFS_NAME_MAX];
245 __s64 blksize;
246 /*
247 * Reading/Writing Extended attributes need to acquire the appropriate
248 * reader/writer semaphore on the orangefs_inode_s structure.
249 */
250 struct rw_semaphore xattr_sem;
251
252 struct inode vfs_inode;
253 sector_t last_failed_block_index_read;
254
255 /*
256 * State of in-memory attributes not yet flushed to disk associated
257 * with this object
258 */
259 unsigned long pinode_flags;
260};
261
262#define P_ATIME_FLAG 0
263#define P_MTIME_FLAG 1
264#define P_CTIME_FLAG 2
265#define P_MODE_FLAG 3
266
267#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
268#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
269#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
270
271#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
272#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
273#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
274
275#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
276#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
277#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
278
279#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
280#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
281#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
282
283/* per superblock private orangefs info */
284struct orangefs_sb_info_s {
285 struct orangefs_khandle root_khandle;
286 __s32 fs_id;
287 int id;
288 int flags;
289#define ORANGEFS_OPT_INTR 0x01
290#define ORANGEFS_OPT_LOCAL_LOCK 0x02
291 char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
292 struct super_block *sb;
293 int mount_pending;
294 struct list_head list;
295};
296
297/*
298 * structure that holds the state of any async I/O operation issued
299 * through the VFS. Needed especially to handle cancellation requests
300 * or even completion notification so that the VFS client-side daemon
301 * can free up its vfs_request slots.
302 */
303struct orangefs_kiocb_s {
304 /* the pointer to the task that initiated the AIO */
305 struct task_struct *tsk;
306
307 /* pointer to the kiocb that kicked this operation */
308 struct kiocb *kiocb;
309
310 /* buffer index that was used for the I/O */
311 struct orangefs_bufmap *bufmap;
312 int buffer_index;
313
314 /* orangefs kernel operation type */
315 struct orangefs_kernel_op_s *op;
316
317 /* The user space buffers from/to which I/O is being staged */
318 struct iovec *iov;
319
320 /* number of elements in the iovector */
321 unsigned long nr_segs;
322
323 /* set to indicate the type of the operation */
324 int rw;
325
326 /* file offset */
327 loff_t offset;
328
329 /* and the count in bytes */
330 size_t bytes_to_be_copied;
331
332 ssize_t bytes_copied;
333 int needs_cleanup;
334};
335
336struct orangefs_stats {
337 unsigned long cache_hits;
338 unsigned long cache_misses;
339 unsigned long reads;
340 unsigned long writes;
341};
342
343extern struct orangefs_stats g_orangefs_stats;
344
345/*
346 * NOTE: See Documentation/filesystems/porting for information
347 * on implementing FOO_I and properly accessing fs private data
348 */
349static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
350{
351 return container_of(inode, struct orangefs_inode_s, vfs_inode);
352}
353
354static inline struct orangefs_sb_info_s *ORANGEFS_SB(struct super_block *sb)
355{
356 return (struct orangefs_sb_info_s *) sb->s_fs_info;
357}
358
359/* ino_t is "unsigned long": 8 bytes / 64 bits on 64-bit architectures. */
360static inline ino_t orangefs_khandle_to_ino(struct orangefs_khandle *khandle)
361{
362 union {
363 unsigned char u[8];
364 __u64 ino;
365 } ihandle;
366
367 ihandle.u[0] = khandle->u[0] ^ khandle->u[4];
368 ihandle.u[1] = khandle->u[1] ^ khandle->u[5];
369 ihandle.u[2] = khandle->u[2] ^ khandle->u[6];
370 ihandle.u[3] = khandle->u[3] ^ khandle->u[7];
371 ihandle.u[4] = khandle->u[12] ^ khandle->u[8];
372 ihandle.u[5] = khandle->u[13] ^ khandle->u[9];
373 ihandle.u[6] = khandle->u[14] ^ khandle->u[10];
374 ihandle.u[7] = khandle->u[15] ^ khandle->u[11];
375
376 return ihandle.ino;
377}
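/*
 * I.e. the 16-byte khandle is folded to 8 bytes by XOR: bytes 0..3
 * with 4..7 and bytes 12..15 with 8..11. For example, a khandle of
 * all zeros maps to inode number 0, and flipping any single khandle
 * byte flips the corresponding ino byte.
 */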
378
379static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
380{
381 return &(ORANGEFS_I(inode)->refn.khandle);
382}
383
384static inline __s32 get_fsid_from_ino(struct inode *inode)
385{
386 return ORANGEFS_I(inode)->refn.fs_id;
387}
388
389static inline ino_t get_ino_from_khandle(struct inode *inode)
390{
391 struct orangefs_khandle *khandle;
392 ino_t ino;
393
394 khandle = get_khandle_from_ino(inode);
395 ino = orangefs_khandle_to_ino(khandle);
396 return ino;
397}
398
399static inline ino_t get_parent_ino_from_dentry(struct dentry *dentry)
400{
401 return get_ino_from_khandle(dentry->d_parent->d_inode);
402}
403
404static inline int is_root_handle(struct inode *inode)
405{
406 gossip_debug(GOSSIP_DCACHE_DEBUG,
407 "%s: root handle: %pU, this handle: %pU:\n",
408 __func__,
409 &ORANGEFS_SB(inode->i_sb)->root_khandle,
410 get_khandle_from_ino(inode));
411
412 if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle),
413 get_khandle_from_ino(inode)))
414 return 0;
415 else
416 return 1;
417}
418
419static inline int match_handle(struct orangefs_khandle resp_handle,
420 struct inode *inode)
421{
422 gossip_debug(GOSSIP_DCACHE_DEBUG,
423 "%s: one handle: %pU, another handle:%pU:\n",
424 __func__,
425 &resp_handle,
426 get_khandle_from_ino(inode));
427
428 if (ORANGEFS_khandle_cmp(&resp_handle, get_khandle_from_ino(inode)))
429 return 0;
430 else
431 return 1;
432}
433
434/*
435 * defined in orangefs-cache.c
436 */
437int op_cache_initialize(void);
438int op_cache_finalize(void);
439struct orangefs_kernel_op_s *op_alloc(__s32 type);
440void orangefs_new_tag(struct orangefs_kernel_op_s *op);
441char *get_opname_string(struct orangefs_kernel_op_s *new_op);
442
443int orangefs_inode_cache_initialize(void);
444int orangefs_inode_cache_finalize(void);
445
446/*
447 * defined in orangefs-mod.c
448 */
449void purge_inprogress_ops(void);
450
451/*
452 * defined in waitqueue.c
453 */
454void purge_waiting_ops(void);
455
456/*
457 * defined in super.c
458 */
459struct dentry *orangefs_mount(struct file_system_type *fst,
460 int flags,
461 const char *devname,
462 void *data);
463
464void orangefs_kill_sb(struct super_block *sb);
465int orangefs_remount(struct orangefs_sb_info_s *);
466
467int fsid_key_table_initialize(void);
468void fsid_key_table_finalize(void);
469
470/*
471 * defined in inode.c
472 */
473__u32 convert_to_orangefs_mask(unsigned long lite_mask);
474struct inode *orangefs_new_inode(struct super_block *sb,
475 struct inode *dir,
476 int mode,
477 dev_t dev,
478 struct orangefs_object_kref *ref);
479
480int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
481
482int orangefs_getattr(struct vfsmount *mnt,
483 struct dentry *dentry,
484 struct kstat *kstat);
485
486int orangefs_permission(struct inode *inode, int mask);
487
488/*
489 * defined in xattr.c
490 */
491int orangefs_setxattr(struct dentry *dentry,
492 const char *name,
493 const void *value,
494 size_t size,
495 int flags);
496
497ssize_t orangefs_getxattr(struct dentry *dentry,
498 const char *name,
499 void *buffer,
500 size_t size);
501
502ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);
503
504/*
505 * defined in namei.c
506 */
507struct inode *orangefs_iget(struct super_block *sb,
508 struct orangefs_object_kref *ref);
509
510ssize_t orangefs_inode_read(struct inode *inode,
511 struct iov_iter *iter,
512 loff_t *offset,
513 loff_t readahead_size);
514
515/*
516 * defined in devorangefs-req.c
517 */
518int orangefs_dev_init(void);
519void orangefs_dev_cleanup(void);
520int is_daemon_in_service(void);
521bool __is_daemon_in_service(void);
522
523/*
524 * defined in orangefs-utils.c
525 */
526__s32 fsid_of_op(struct orangefs_kernel_op_s *op);
527
528int orangefs_flush_inode(struct inode *inode);
529
530ssize_t orangefs_inode_getxattr(struct inode *inode,
531 const char *prefix,
532 const char *name,
533 void *buffer,
534 size_t size);
535
536int orangefs_inode_setxattr(struct inode *inode,
537 const char *prefix,
538 const char *name,
539 const void *value,
540 size_t size,
541 int flags);
542
543int orangefs_inode_getattr(struct inode *inode, int new, int size);
544
545int orangefs_inode_check_changed(struct inode *inode);
546
547int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
548
549void orangefs_make_bad_inode(struct inode *inode);
550
551int orangefs_unmount_sb(struct super_block *sb);
552
553bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
554
555int orangefs_normalize_to_errno(__s32 error_code);
556
557extern struct mutex devreq_mutex;
558extern struct mutex request_mutex;
559extern int debug;
560extern int op_timeout_secs;
561extern int slot_timeout_secs;
562extern struct list_head orangefs_superblocks;
563extern spinlock_t orangefs_superblocks_lock;
564extern struct list_head orangefs_request_list;
565extern spinlock_t orangefs_request_list_lock;
566extern wait_queue_head_t orangefs_request_list_waitq;
567extern struct list_head *htable_ops_in_progress;
568extern spinlock_t htable_ops_in_progress_lock;
569extern int hash_table_size;
570
571extern const struct address_space_operations orangefs_address_operations;
572extern struct backing_dev_info orangefs_backing_dev_info;
573extern struct inode_operations orangefs_file_inode_operations;
574extern const struct file_operations orangefs_file_operations;
575extern struct inode_operations orangefs_symlink_inode_operations;
576extern struct inode_operations orangefs_dir_inode_operations;
577extern const struct file_operations orangefs_dir_operations;
578extern const struct dentry_operations orangefs_dentry_operations;
579extern const struct file_operations orangefs_devreq_file_operations;
580
581extern wait_queue_head_t orangefs_bufmap_init_waitq;
582
583/*
584 * misc convenience macros
585 */
586
587#define ORANGEFS_OP_INTERRUPTIBLE 1 /* service_operation() is interruptible */
588#define ORANGEFS_OP_PRIORITY 2 /* service_operation() is high priority */
589#define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */
590#define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */
591#define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */
592
593int service_operation(struct orangefs_kernel_op_s *op,
594 const char *op_name,
595 int flags);
596
597#define get_interruptible_flag(inode) \
598 ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
599 ORANGEFS_OP_INTERRUPTIBLE : 0)
600
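/*
 * Usage sketch for the macro above (hedged; this mirrors how callers
 * in this patch set appear to combine it with service_operation(),
 * but the actual call sites are not part of this hunk):
 *
 *	rc = service_operation(new_op, __func__,
 *			       get_interruptible_flag(inode));
 */
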
601#define fill_default_sys_attrs(sys_attr, type, mode) \
602do { \
603 sys_attr.owner = from_kuid(current_user_ns(), current_fsuid()); \
604 sys_attr.group = from_kgid(current_user_ns(), current_fsgid()); \
605 sys_attr.perms = ORANGEFS_util_translate_mode(mode); \
606 sys_attr.mtime = 0; \
607 sys_attr.atime = 0; \
608 sys_attr.ctime = 0; \
609 sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \
610} while (0)
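
/*
 * A minimal usage sketch for fill_default_sys_attrs(). The field and
 * constant names (upcall.req.create.attributes, ORANGEFS_TYPE_METAFILE)
 * are assumptions about call sites that are not shown in this hunk:
 *
 *	struct orangefs_kernel_op_s *new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
 *
 *	if (new_op)
 *		fill_default_sys_attrs(new_op->upcall.req.create.attributes,
 *				       ORANGEFS_TYPE_METAFILE, mode);
 */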
611
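/*
 * On 32-bit SMP kernels a 64-bit i_size cannot be updated atomically,
 * so callers of i_size_write() must serialize against each other;
 * i_mutex provides that serialization here. On 64-bit (or non-SMP)
 * builds the plain store is sufficient.
 */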
612static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
613{
614#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
615 mutex_lock(&inode->i_mutex);
616#endif
617 i_size_write(inode, i_size);
618#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
619 mutex_unlock(&inode->i_mutex);
620#endif
621}
622
623#endif /* __ORANGEFSKERNEL_H */
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
new file mode 100644
index 000000000000..6f072a8c0de1
--- /dev/null
+++ b/fs/orangefs/orangefs-mod.c
@@ -0,0 +1,293 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * Changes by Acxiom Corporation to add proc file handler for pvfs2 client
5 * parameters, Copyright Acxiom Corporation, 2005.
6 *
7 * See COPYING in top-level directory.
8 */
9
10#include "protocol.h"
11#include "orangefs-kernel.h"
12#include "orangefs-debugfs.h"
13#include "orangefs-sysfs.h"
14
15/* ORANGEFS_VERSION is a ./configure define */
16#ifndef ORANGEFS_VERSION
17#define ORANGEFS_VERSION "upstream"
18#endif
19
20/*
21 * global variables declared here
22 */
23
24/* array of client debug keyword/mask values */
25struct client_debug_mask *cdm_array;
26int cdm_element_count;
27
28char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
29char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
30char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
31
32char *debug_help_string;
33int help_string_initialized;
34struct dentry *help_file_dentry;
35struct dentry *client_debug_dentry;
36struct dentry *debug_dir;
37int client_verbose_index;
38int client_all_index;
39struct orangefs_stats g_orangefs_stats;
40
41/* the size of the hash table for ops in progress */
42int hash_table_size = 509;
43
44static ulong module_parm_debug_mask;
45__u64 gossip_debug_mask;
46struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
47unsigned int kernel_mask_set_mod_init; /* implicitly false */
48int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
49int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
50
51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("ORANGEFS Development Team");
53MODULE_DESCRIPTION("The Linux Kernel VFS interface to ORANGEFS");
54MODULE_PARM_DESC(module_parm_debug_mask, "debugging level (see orangefs-debug.h for values)");
55MODULE_PARM_DESC(op_timeout_secs, "Operation timeout in seconds");
56MODULE_PARM_DESC(slot_timeout_secs, "Slot timeout in seconds");
57MODULE_PARM_DESC(hash_table_size,
58 "size of hash table for operations in progress");
59
60static struct file_system_type orangefs_fs_type = {
61 .name = "pvfs2",
62 .mount = orangefs_mount,
63 .kill_sb = orangefs_kill_sb,
64 .owner = THIS_MODULE,
65};
66
67module_param(hash_table_size, int, 0);
68module_param(module_parm_debug_mask, ulong, 0644);
69module_param(op_timeout_secs, int, 0);
70module_param(slot_timeout_secs, int, 0);
71
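/*
 * Example module load (a sketch; the parameter names come from the
 * module_param() calls above, the values are hypothetical):
 *
 *	# modprobe orangefs module_parm_debug_mask=0x1 op_timeout_secs=30
 */
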
72/* synchronizes the request device file */
73DEFINE_MUTEX(devreq_mutex);
74
75/*
76 * Blocks non-priority requests from being queued for servicing. This
77 * could be used for protecting the request list data structure, but
 78 * for now it is only used to stall the addition of ops to the
 79 * request list.
80 */
81DEFINE_MUTEX(request_mutex);
82
83/* hash table for storing operations waiting for matching downcall */
84struct list_head *htable_ops_in_progress;
85DEFINE_SPINLOCK(htable_ops_in_progress_lock);
86
87/* list for queueing upcall operations */
88LIST_HEAD(orangefs_request_list);
89
90/* used to protect the above orangefs_request_list */
91DEFINE_SPINLOCK(orangefs_request_list_lock);
92
93/* used for incoming request notification */
94DECLARE_WAIT_QUEUE_HEAD(orangefs_request_list_waitq);
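
/*
 * A sketch of the producer side that ties the list, lock, and wait
 * queue above together (the real enqueue path lives elsewhere in this
 * patch set and is not shown in this hunk):
 *
 *	spin_lock(&orangefs_request_list_lock);
 *	list_add_tail(&op->list, &orangefs_request_list);
 *	spin_unlock(&orangefs_request_list_lock);
 *	wake_up_interruptible(&orangefs_request_list_waitq);
 */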
95
96static int __init orangefs_init(void)
97{
98 int ret = -1;
99 __u32 i = 0;
100
101 /* convert input debug mask to a 64-bit unsigned integer */
102 gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
103
104 /*
105 * set the kernel's gossip debug string; invalid mask values will
106 * be ignored.
107 */
108 debug_mask_to_string(&gossip_debug_mask, 0);
109
110 /* remove any invalid values from the mask */
111 debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
112
113 /*
114 * if the mask has a non-zero value, then indicate that the mask
115 * was set when the kernel module was loaded. The orangefs dev ioctl
116 * command will look at this boolean to determine if the kernel's
117 * debug mask should be overwritten when the client-core is started.
118 */
119 if (gossip_debug_mask != 0)
120 kernel_mask_set_mod_init = true;
121
122 pr_info("%s: called with debug mask: :%s: :%llx:\n",
123 __func__,
124 kernel_debug_string,
125 (unsigned long long)gossip_debug_mask);
126
127 ret = bdi_init(&orangefs_backing_dev_info);
128
129 if (ret)
130 return ret;
131
132 if (op_timeout_secs < 0)
133 op_timeout_secs = 0;
134
135 if (slot_timeout_secs < 0)
136 slot_timeout_secs = 0;
137
 138 /* initialize global bookkeeping data structures */
139 ret = op_cache_initialize();
140 if (ret < 0)
141 goto err;
142
143 ret = orangefs_inode_cache_initialize();
144 if (ret < 0)
145 goto cleanup_op;
146
147 htable_ops_in_progress =
148 kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
149 if (!htable_ops_in_progress) {
 150 gossip_err("Failed to initialize op hashtable\n");
151 ret = -ENOMEM;
152 goto cleanup_inode;
153 }
154
 155 /* initialize a doubly linked list at each hash table index */
156 for (i = 0; i < hash_table_size; i++)
157 INIT_LIST_HEAD(&htable_ops_in_progress[i]);
158
159 ret = fsid_key_table_initialize();
160 if (ret < 0)
161 goto cleanup_progress_table;
162
163 /*
164 * Build the contents of /sys/kernel/debug/orangefs/debug-help
165 * from the keywords in the kernel keyword/mask array.
166 *
167 * The keywords in the client keyword/mask array are
168 * unknown at boot time.
169 *
170 * orangefs_prepare_debugfs_help_string will be used again
171 * later to rebuild the debug-help file after the client starts
 172 * and passes along the needed info. The argument indicates
 173 * whether orangefs_prepare_debugfs_help_string is being called
 174 * for the first time or for a later rebuild.
175 */
176 ret = orangefs_prepare_debugfs_help_string(1);
177 if (ret)
178 goto cleanup_key_table;
179
180 ret = orangefs_debugfs_init();
181 if (ret)
182 goto debugfs_init_failed;
183
184 ret = orangefs_kernel_debug_init();
185 if (ret)
186 goto kernel_debug_init_failed;
187
188 ret = orangefs_sysfs_init();
189 if (ret)
190 goto sysfs_init_failed;
191
 192 /* Initialize the orangefs device subsystem. */
193 ret = orangefs_dev_init();
194 if (ret < 0) {
195 gossip_err("%s: could not initialize device subsystem %d!\n",
196 __func__,
197 ret);
198 goto cleanup_device;
199 }
200
201 ret = register_filesystem(&orangefs_fs_type);
202 if (ret == 0) {
203 pr_info("orangefs: module version %s loaded\n", ORANGEFS_VERSION);
204 ret = 0;
205 goto out;
206 }
207
208 orangefs_sysfs_exit();
209
210cleanup_device:
211 orangefs_dev_cleanup();
212
213sysfs_init_failed:
214
215kernel_debug_init_failed:
216
217debugfs_init_failed:
218 orangefs_debugfs_cleanup();
219
220cleanup_key_table:
221 fsid_key_table_finalize();
222
223cleanup_progress_table:
224 kfree(htable_ops_in_progress);
225
226cleanup_inode:
227 orangefs_inode_cache_finalize();
228
229cleanup_op:
230 op_cache_finalize();
231
232err:
233 bdi_destroy(&orangefs_backing_dev_info);
234
235out:
236 return ret;
237}
238
239static void __exit orangefs_exit(void)
240{
241 int i = 0;
242 gossip_debug(GOSSIP_INIT_DEBUG, "orangefs: orangefs_exit called\n");
243
244 unregister_filesystem(&orangefs_fs_type);
245 orangefs_debugfs_cleanup();
246 orangefs_sysfs_exit();
247 fsid_key_table_finalize();
248 orangefs_dev_cleanup();
249 BUG_ON(!list_empty(&orangefs_request_list));
250 for (i = 0; i < hash_table_size; i++)
251 BUG_ON(!list_empty(&htable_ops_in_progress[i]));
252
253 orangefs_inode_cache_finalize();
254 op_cache_finalize();
255
256 kfree(htable_ops_in_progress);
257
258 bdi_destroy(&orangefs_backing_dev_info);
259
260 pr_info("orangefs: module version %s unloaded\n", ORANGEFS_VERSION);
261}
262
263/*
264 * Walk the hash table of in-progress operations and mark each
265 * operation found there as purged.
266 */
267void purge_inprogress_ops(void)
268{
269 int i;
270
271 for (i = 0; i < hash_table_size; i++) {
272 struct orangefs_kernel_op_s *op;
273 struct orangefs_kernel_op_s *next;
274
275 spin_lock(&htable_ops_in_progress_lock);
276 list_for_each_entry_safe(op,
277 next,
278 &htable_ops_in_progress[i],
279 list) {
280 set_op_state_purged(op);
281 gossip_debug(GOSSIP_DEV_DEBUG,
282 "%s: op:%s: op_state:%d: process:%s:\n",
283 __func__,
284 get_opname_string(op),
285 op->op_state,
286 current->comm);
287 }
288 spin_unlock(&htable_ops_in_progress_lock);
289 }
290}
291
292module_init(orangefs_init);
293module_exit(orangefs_exit);
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
new file mode 100644
index 000000000000..5c03113e3ad2
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -0,0 +1,1772 @@
1/*
2 * Documentation/ABI/stable/orangefs-sysfs:
3 *
4 * What: /sys/fs/orangefs/perf_counter_reset
 5 * Date: Jun 2015
6 * Contact: Mike Marshall <hubcap@omnibond.com>
7 * Description:
8 * echo a 0 or a 1 into perf_counter_reset to
9 * reset all the counters in
10 * /sys/fs/orangefs/perf_counters
11 * except ones with PINT_PERF_PRESERVE set.
12 *
13 *
14 * What: /sys/fs/orangefs/perf_counters/...
15 * Date: Jun 2015
16 * Contact: Mike Marshall <hubcap@omnibond.com>
17 * Description:
18 * Counters and settings for various caches.
19 * Read only.
20 *
21 *
22 * What: /sys/fs/orangefs/perf_time_interval_secs
23 * Date: Jun 2015
24 * Contact: Mike Marshall <hubcap@omnibond.com>
25 * Description:
26 * Length of perf counter intervals in
27 * seconds.
28 *
29 *
30 * What: /sys/fs/orangefs/perf_history_size
31 * Date: Jun 2015
32 * Contact: Mike Marshall <hubcap@omnibond.com>
33 * Description:
34 * The perf_counters cache statistics have N, or
35 * perf_history_size, samples. The default is
36 * one.
37 *
 38 * Every perf_time_interval_secs, the (first)
39 * samples are reset.
40 *
41 * If N is greater than one, the "current" set
42 * of samples is reset, and the samples from the
43 * other N-1 intervals remain available.
44 *
45 *
46 * What: /sys/fs/orangefs/op_timeout_secs
47 * Date: Jun 2015
48 * Contact: Mike Marshall <hubcap@omnibond.com>
49 * Description:
50 * Service operation timeout in seconds.
51 *
52 *
53 * What: /sys/fs/orangefs/slot_timeout_secs
54 * Date: Jun 2015
55 * Contact: Mike Marshall <hubcap@omnibond.com>
56 * Description:
57 * "Slot" timeout in seconds. A "slot"
58 * is an indexed buffer in the shared
59 * memory segment used for communication
60 * between the kernel module and userspace.
 61 * Slots are requested and waited for;
62 * the wait times out after slot_timeout_secs.
63 *
64 *
65 * What: /sys/fs/orangefs/acache/...
66 * Date: Jun 2015
67 * Contact: Mike Marshall <hubcap@omnibond.com>
68 * Description:
69 * Attribute cache configurable settings.
70 *
71 *
72 * What: /sys/fs/orangefs/ncache/...
73 * Date: Jun 2015
74 * Contact: Mike Marshall <hubcap@omnibond.com>
75 * Description:
76 * Name cache configurable settings.
77 *
78 *
79 * What: /sys/fs/orangefs/capcache/...
80 * Date: Jun 2015
81 * Contact: Mike Marshall <hubcap@omnibond.com>
82 * Description:
83 * Capability cache configurable settings.
84 *
85 *
86 * What: /sys/fs/orangefs/ccache/...
87 * Date: Jun 2015
88 * Contact: Mike Marshall <hubcap@omnibond.com>
89 * Description:
90 * Credential cache configurable settings.
91 *
92 */
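
/*
 * Example interaction with the files documented above (a sketch that
 * assumes the module is loaded and the client-core is running):
 *
 *	# cat /sys/fs/orangefs/op_timeout_secs
 *	# echo 30 > /sys/fs/orangefs/op_timeout_secs
 *	# echo 1 > /sys/fs/orangefs/perf_counter_reset
 */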
93
94#include <linux/fs.h>
95#include <linux/kobject.h>
96#include <linux/string.h>
97#include <linux/sysfs.h>
98#include <linux/module.h>
99#include <linux/init.h>
100
101#include "protocol.h"
102#include "orangefs-kernel.h"
103#include "orangefs-sysfs.h"
104
105#define ORANGEFS_KOBJ_ID "orangefs"
106#define ACACHE_KOBJ_ID "acache"
107#define CAPCACHE_KOBJ_ID "capcache"
108#define CCACHE_KOBJ_ID "ccache"
109#define NCACHE_KOBJ_ID "ncache"
110#define PC_KOBJ_ID "pc"
111#define STATS_KOBJ_ID "stats"
112
113struct orangefs_obj {
114 struct kobject kobj;
115 int op_timeout_secs;
116 int perf_counter_reset;
117 int perf_history_size;
118 int perf_time_interval_secs;
119 int slot_timeout_secs;
120};
121
122struct acache_orangefs_obj {
123 struct kobject kobj;
124 int hard_limit;
125 int reclaim_percentage;
126 int soft_limit;
127 int timeout_msecs;
128};
129
130struct capcache_orangefs_obj {
131 struct kobject kobj;
132 int hard_limit;
133 int reclaim_percentage;
134 int soft_limit;
135 int timeout_secs;
136};
137
138struct ccache_orangefs_obj {
139 struct kobject kobj;
140 int hard_limit;
141 int reclaim_percentage;
142 int soft_limit;
143 int timeout_secs;
144};
145
146struct ncache_orangefs_obj {
147 struct kobject kobj;
148 int hard_limit;
149 int reclaim_percentage;
150 int soft_limit;
151 int timeout_msecs;
152};
153
154struct pc_orangefs_obj {
155 struct kobject kobj;
156 char *acache;
157 char *capcache;
158 char *ncache;
159};
160
161struct stats_orangefs_obj {
162 struct kobject kobj;
163 int reads;
164 int writes;
165};
166
167struct orangefs_attribute {
168 struct attribute attr;
169 ssize_t (*show)(struct orangefs_obj *orangefs_obj,
170 struct orangefs_attribute *attr,
171 char *buf);
172 ssize_t (*store)(struct orangefs_obj *orangefs_obj,
173 struct orangefs_attribute *attr,
174 const char *buf,
175 size_t count);
176};
177
178struct acache_orangefs_attribute {
179 struct attribute attr;
180 ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
181 struct acache_orangefs_attribute *attr,
182 char *buf);
183 ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
184 struct acache_orangefs_attribute *attr,
185 const char *buf,
186 size_t count);
187};
188
189struct capcache_orangefs_attribute {
190 struct attribute attr;
191 ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
192 struct capcache_orangefs_attribute *attr,
193 char *buf);
194 ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
195 struct capcache_orangefs_attribute *attr,
196 const char *buf,
197 size_t count);
198};
199
200struct ccache_orangefs_attribute {
201 struct attribute attr;
202 ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
203 struct ccache_orangefs_attribute *attr,
204 char *buf);
205 ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
206 struct ccache_orangefs_attribute *attr,
207 const char *buf,
208 size_t count);
209};
210
211struct ncache_orangefs_attribute {
212 struct attribute attr;
213 ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
214 struct ncache_orangefs_attribute *attr,
215 char *buf);
216 ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
217 struct ncache_orangefs_attribute *attr,
218 const char *buf,
219 size_t count);
220};
221
222struct pc_orangefs_attribute {
223 struct attribute attr;
224 ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
225 struct pc_orangefs_attribute *attr,
226 char *buf);
227 ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
228 struct pc_orangefs_attribute *attr,
229 const char *buf,
230 size_t count);
231};
232
233struct stats_orangefs_attribute {
234 struct attribute attr;
235 ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
236 struct stats_orangefs_attribute *attr,
237 char *buf);
238 ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
239 struct stats_orangefs_attribute *attr,
240 const char *buf,
241 size_t count);
242};
243
244static ssize_t orangefs_attr_show(struct kobject *kobj,
245 struct attribute *attr,
246 char *buf)
247{
248 struct orangefs_attribute *attribute;
249 struct orangefs_obj *orangefs_obj;
250 int rc;
251
252 attribute = container_of(attr, struct orangefs_attribute, attr);
253 orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
254
255 if (!attribute->show) {
256 rc = -EIO;
257 goto out;
258 }
259
260 rc = attribute->show(orangefs_obj, attribute, buf);
261
262out:
263 return rc;
264}
265
266static ssize_t orangefs_attr_store(struct kobject *kobj,
267 struct attribute *attr,
268 const char *buf,
269 size_t len)
270{
271 struct orangefs_attribute *attribute;
272 struct orangefs_obj *orangefs_obj;
273 int rc;
274
275 gossip_debug(GOSSIP_SYSFS_DEBUG,
276 "orangefs_attr_store: start\n");
277
278 attribute = container_of(attr, struct orangefs_attribute, attr);
279 orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
280
281 if (!attribute->store) {
282 rc = -EIO;
283 goto out;
284 }
285
286 rc = attribute->store(orangefs_obj, attribute, buf, len);
287
288out:
289 return rc;
290}
291
292static const struct sysfs_ops orangefs_sysfs_ops = {
293 .show = orangefs_attr_show,
294 .store = orangefs_attr_store,
295};
296
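/*
 * Every kobject type below repeats the dispatch pattern shown above:
 * sysfs hands us the raw kobject and attribute, container_of() recovers
 * the wrapping orangefs-specific object and attribute structs, and the
 * call is forwarded to the attribute's own show/store method.
 */
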
297static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
298 struct attribute *attr,
299 char *buf)
300{
301 struct acache_orangefs_attribute *attribute;
302 struct acache_orangefs_obj *acache_orangefs_obj;
303 int rc;
304
305 attribute = container_of(attr, struct acache_orangefs_attribute, attr);
306 acache_orangefs_obj =
307 container_of(kobj, struct acache_orangefs_obj, kobj);
308
309 if (!attribute->show) {
310 rc = -EIO;
311 goto out;
312 }
313
314 rc = attribute->show(acache_orangefs_obj, attribute, buf);
315
316out:
317 return rc;
318}
319
320static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
321 struct attribute *attr,
322 const char *buf,
323 size_t len)
324{
325 struct acache_orangefs_attribute *attribute;
326 struct acache_orangefs_obj *acache_orangefs_obj;
327 int rc;
328
329 gossip_debug(GOSSIP_SYSFS_DEBUG,
330 "acache_orangefs_attr_store: start\n");
331
332 attribute = container_of(attr, struct acache_orangefs_attribute, attr);
333 acache_orangefs_obj =
334 container_of(kobj, struct acache_orangefs_obj, kobj);
335
336 if (!attribute->store) {
337 rc = -EIO;
338 goto out;
339 }
340
341 rc = attribute->store(acache_orangefs_obj, attribute, buf, len);
342
343out:
344 return rc;
345}
346
347static const struct sysfs_ops acache_orangefs_sysfs_ops = {
348 .show = acache_orangefs_attr_show,
349 .store = acache_orangefs_attr_store,
350};
351
352static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
353 struct attribute *attr,
354 char *buf)
355{
356 struct capcache_orangefs_attribute *attribute;
357 struct capcache_orangefs_obj *capcache_orangefs_obj;
358 int rc;
359
360 attribute =
361 container_of(attr, struct capcache_orangefs_attribute, attr);
362 capcache_orangefs_obj =
363 container_of(kobj, struct capcache_orangefs_obj, kobj);
364
365 if (!attribute->show) {
366 rc = -EIO;
367 goto out;
368 }
369
370 rc = attribute->show(capcache_orangefs_obj, attribute, buf);
371
372out:
373 return rc;
374}
375
376static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
377 struct attribute *attr,
378 const char *buf,
379 size_t len)
380{
381 struct capcache_orangefs_attribute *attribute;
382 struct capcache_orangefs_obj *capcache_orangefs_obj;
383 int rc;
384
385 gossip_debug(GOSSIP_SYSFS_DEBUG,
386 "capcache_orangefs_attr_store: start\n");
387
388 attribute =
389 container_of(attr, struct capcache_orangefs_attribute, attr);
390 capcache_orangefs_obj =
391 container_of(kobj, struct capcache_orangefs_obj, kobj);
392
393 if (!attribute->store) {
394 rc = -EIO;
395 goto out;
396 }
397
398 rc = attribute->store(capcache_orangefs_obj, attribute, buf, len);
399
400out:
401 return rc;
402}
403
404static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
405 .show = capcache_orangefs_attr_show,
406 .store = capcache_orangefs_attr_store,
407};
408
409static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
410 struct attribute *attr,
411 char *buf)
412{
413 struct ccache_orangefs_attribute *attribute;
414 struct ccache_orangefs_obj *ccache_orangefs_obj;
415 int rc;
416
417 attribute =
418 container_of(attr, struct ccache_orangefs_attribute, attr);
419 ccache_orangefs_obj =
420 container_of(kobj, struct ccache_orangefs_obj, kobj);
421
422 if (!attribute->show) {
423 rc = -EIO;
424 goto out;
425 }
426
427 rc = attribute->show(ccache_orangefs_obj, attribute, buf);
428
429out:
430 return rc;
431}
432
433static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
434 struct attribute *attr,
435 const char *buf,
436 size_t len)
437{
438 struct ccache_orangefs_attribute *attribute;
439 struct ccache_orangefs_obj *ccache_orangefs_obj;
440 int rc;
441
442 gossip_debug(GOSSIP_SYSFS_DEBUG,
443 "ccache_orangefs_attr_store: start\n");
444
445 attribute =
446 container_of(attr, struct ccache_orangefs_attribute, attr);
447 ccache_orangefs_obj =
448 container_of(kobj, struct ccache_orangefs_obj, kobj);
449
450 if (!attribute->store) {
451 rc = -EIO;
452 goto out;
453 }
454
455 rc = attribute->store(ccache_orangefs_obj, attribute, buf, len);
456
457out:
458 return rc;
459}
460
461static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
462 .show = ccache_orangefs_attr_show,
463 .store = ccache_orangefs_attr_store,
464};
465
466static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
467 struct attribute *attr,
468 char *buf)
469{
470 struct ncache_orangefs_attribute *attribute;
471 struct ncache_orangefs_obj *ncache_orangefs_obj;
472 int rc;
473
474 attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
475 ncache_orangefs_obj =
476 container_of(kobj, struct ncache_orangefs_obj, kobj);
477
478 if (!attribute->show) {
479 rc = -EIO;
480 goto out;
481 }
482
483 rc = attribute->show(ncache_orangefs_obj, attribute, buf);
484
485out:
486 return rc;
487}
488
489static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
490 struct attribute *attr,
491 const char *buf,
492 size_t len)
493{
494 struct ncache_orangefs_attribute *attribute;
495 struct ncache_orangefs_obj *ncache_orangefs_obj;
496 int rc;
497
498 gossip_debug(GOSSIP_SYSFS_DEBUG,
499 "ncache_orangefs_attr_store: start\n");
500
501 attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
502 ncache_orangefs_obj =
503 container_of(kobj, struct ncache_orangefs_obj, kobj);
504
505 if (!attribute->store) {
506 rc = -EIO;
507 goto out;
508 }
509
510 rc = attribute->store(ncache_orangefs_obj, attribute, buf, len);
511
512out:
513 return rc;
514}
515
516static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
517 .show = ncache_orangefs_attr_show,
518 .store = ncache_orangefs_attr_store,
519};
520
521static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
522 struct attribute *attr,
523 char *buf)
524{
525 struct pc_orangefs_attribute *attribute;
526 struct pc_orangefs_obj *pc_orangefs_obj;
527 int rc;
528
529 attribute = container_of(attr, struct pc_orangefs_attribute, attr);
530 pc_orangefs_obj =
531 container_of(kobj, struct pc_orangefs_obj, kobj);
532
533 if (!attribute->show) {
534 rc = -EIO;
535 goto out;
536 }
537
538 rc = attribute->show(pc_orangefs_obj, attribute, buf);
539
540out:
541 return rc;
542}
543
544static const struct sysfs_ops pc_orangefs_sysfs_ops = {
545 .show = pc_orangefs_attr_show,
546};
547
548static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
549 struct attribute *attr,
550 char *buf)
551{
552 struct stats_orangefs_attribute *attribute;
553 struct stats_orangefs_obj *stats_orangefs_obj;
554 int rc;
555
556 attribute = container_of(attr, struct stats_orangefs_attribute, attr);
557 stats_orangefs_obj =
558 container_of(kobj, struct stats_orangefs_obj, kobj);
559
560 if (!attribute->show) {
561 rc = -EIO;
562 goto out;
563 }
564
565 rc = attribute->show(stats_orangefs_obj, attribute, buf);
566
567out:
568 return rc;
569}
570
571static const struct sysfs_ops stats_orangefs_sysfs_ops = {
572 .show = stats_orangefs_attr_show,
573};
574
575static void orangefs_release(struct kobject *kobj)
576{
577 struct orangefs_obj *orangefs_obj;
578
579 orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
580 kfree(orangefs_obj);
581}
582
583static void acache_orangefs_release(struct kobject *kobj)
584{
585 struct acache_orangefs_obj *acache_orangefs_obj;
586
587 acache_orangefs_obj =
588 container_of(kobj, struct acache_orangefs_obj, kobj);
589 kfree(acache_orangefs_obj);
590}
591
592static void capcache_orangefs_release(struct kobject *kobj)
593{
594 struct capcache_orangefs_obj *capcache_orangefs_obj;
595
596 capcache_orangefs_obj =
597 container_of(kobj, struct capcache_orangefs_obj, kobj);
598 kfree(capcache_orangefs_obj);
599}
600
601static void ccache_orangefs_release(struct kobject *kobj)
602{
603 struct ccache_orangefs_obj *ccache_orangefs_obj;
604
605 ccache_orangefs_obj =
606 container_of(kobj, struct ccache_orangefs_obj, kobj);
607 kfree(ccache_orangefs_obj);
608}
609
610static void ncache_orangefs_release(struct kobject *kobj)
611{
612 struct ncache_orangefs_obj *ncache_orangefs_obj;
613
614 ncache_orangefs_obj =
615 container_of(kobj, struct ncache_orangefs_obj, kobj);
616 kfree(ncache_orangefs_obj);
617}
618
619static void pc_orangefs_release(struct kobject *kobj)
620{
621 struct pc_orangefs_obj *pc_orangefs_obj;
622
623 pc_orangefs_obj =
624 container_of(kobj, struct pc_orangefs_obj, kobj);
625 kfree(pc_orangefs_obj);
626}
627
628static void stats_orangefs_release(struct kobject *kobj)
629{
630 struct stats_orangefs_obj *stats_orangefs_obj;
631
632 stats_orangefs_obj =
633 container_of(kobj, struct stats_orangefs_obj, kobj);
634 kfree(stats_orangefs_obj);
635}
636
637static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
638{
639 int rc = -EIO;
640 struct orangefs_attribute *orangefs_attr;
641 struct stats_orangefs_attribute *stats_orangefs_attr;
642
643 gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
644
645 if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
646 orangefs_attr = (struct orangefs_attribute *)attr;
647
648 if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) {
649 rc = scnprintf(buf,
650 PAGE_SIZE,
651 "%d\n",
652 op_timeout_secs);
653 goto out;
654 } else if (!strcmp(orangefs_attr->attr.name,
655 "slot_timeout_secs")) {
656 rc = scnprintf(buf,
657 PAGE_SIZE,
658 "%d\n",
659 slot_timeout_secs);
660 goto out;
661 } else {
662 goto out;
663 }
664
665 } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) {
666 stats_orangefs_attr = (struct stats_orangefs_attribute *)attr;
667
668 if (!strcmp(stats_orangefs_attr->attr.name, "reads")) {
669 rc = scnprintf(buf,
670 PAGE_SIZE,
671 "%lu\n",
672 g_orangefs_stats.reads);
673 goto out;
674 } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) {
675 rc = scnprintf(buf,
676 PAGE_SIZE,
677 "%lu\n",
678 g_orangefs_stats.writes);
679 goto out;
680 } else {
681 goto out;
682 }
683 }
684
685out:
686
687 return rc;
688}
689
690static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
691 struct orangefs_attribute *attr,
692 char *buf)
693{
694 int rc;
695
696 gossip_debug(GOSSIP_SYSFS_DEBUG,
697 "int_orangefs_show:start attr->attr.name:%s:\n",
698 attr->attr.name);
699
700 rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
701
702 return rc;
703}
704
705static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
706 struct stats_orangefs_attribute *attr,
707 char *buf)
708{
709 int rc;
710
711 gossip_debug(GOSSIP_SYSFS_DEBUG,
712 "int_stats_show:start attr->attr.name:%s:\n",
713 attr->attr.name);
714
715 rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
716
717 return rc;
718}
719
720static ssize_t int_store(struct orangefs_obj *orangefs_obj,
721 struct orangefs_attribute *attr,
722 const char *buf,
723 size_t count)
724{
725 int rc = 0;
726
727 gossip_debug(GOSSIP_SYSFS_DEBUG,
728 "int_store: start attr->attr.name:%s: buf:%s:\n",
729 attr->attr.name, buf);
730
731 if (!strcmp(attr->attr.name, "op_timeout_secs")) {
732 rc = kstrtoint(buf, 0, &op_timeout_secs);
733 goto out;
734 } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
735 rc = kstrtoint(buf, 0, &slot_timeout_secs);
736 goto out;
737 } else {
738 goto out;
739 }
740
741out:
742 if (rc)
743 rc = -EINVAL;
744 else
745 rc = count;
746
747 return rc;
748}
749
750/*
751 * obtain attribute values from userspace with a service operation.
752 */
753static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
754{
755 struct orangefs_kernel_op_s *new_op = NULL;
756 int rc = 0;
757 char *ser_op_type = NULL;
758 struct orangefs_attribute *orangefs_attr;
759 struct acache_orangefs_attribute *acache_attr;
760 struct capcache_orangefs_attribute *capcache_attr;
761 struct ccache_orangefs_attribute *ccache_attr;
762 struct ncache_orangefs_attribute *ncache_attr;
763 struct pc_orangefs_attribute *pc_attr;
764 __u32 op_alloc_type;
765
766 gossip_debug(GOSSIP_SYSFS_DEBUG,
767 "sysfs_service_op_show: id:%s:\n",
768 kobj_id);
769
770 if (strcmp(kobj_id, PC_KOBJ_ID))
771 op_alloc_type = ORANGEFS_VFS_OP_PARAM;
772 else
773 op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
774
775 new_op = op_alloc(op_alloc_type);
776 if (!new_op)
777 return -ENOMEM;
778
779 /* Can't do a service_operation if the client is not running... */
780 rc = is_daemon_in_service();
781 if (rc) {
782 pr_info("%s: Client not running :%d:\n",
783 __func__,
784 is_daemon_in_service());
785 goto out;
786 }
787
788 if (strcmp(kobj_id, PC_KOBJ_ID))
789 new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
790
791 if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
792 orangefs_attr = (struct orangefs_attribute *)attr;
793
794 if (!strcmp(orangefs_attr->attr.name, "perf_history_size"))
795 new_op->upcall.req.param.op =
796 ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
797 else if (!strcmp(orangefs_attr->attr.name,
798 "perf_time_interval_secs"))
799 new_op->upcall.req.param.op =
800 ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
801 else if (!strcmp(orangefs_attr->attr.name,
802 "perf_counter_reset"))
803 new_op->upcall.req.param.op =
804 ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
805
806 } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
807 acache_attr = (struct acache_orangefs_attribute *)attr;
808
809 if (!strcmp(acache_attr->attr.name, "timeout_msecs"))
810 new_op->upcall.req.param.op =
811 ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
812
813 if (!strcmp(acache_attr->attr.name, "hard_limit"))
814 new_op->upcall.req.param.op =
815 ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
816
817 if (!strcmp(acache_attr->attr.name, "soft_limit"))
818 new_op->upcall.req.param.op =
819 ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
820
821 if (!strcmp(acache_attr->attr.name, "reclaim_percentage"))
822 new_op->upcall.req.param.op =
823 ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
824
825 } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
826 capcache_attr = (struct capcache_orangefs_attribute *)attr;
827
828 if (!strcmp(capcache_attr->attr.name, "timeout_secs"))
829 new_op->upcall.req.param.op =
830 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
831
832 if (!strcmp(capcache_attr->attr.name, "hard_limit"))
833 new_op->upcall.req.param.op =
834 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
835
836 if (!strcmp(capcache_attr->attr.name, "soft_limit"))
837 new_op->upcall.req.param.op =
838 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
839
840 if (!strcmp(capcache_attr->attr.name, "reclaim_percentage"))
841 new_op->upcall.req.param.op =
842 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
843
844 } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
845 ccache_attr = (struct ccache_orangefs_attribute *)attr;
846
847 if (!strcmp(ccache_attr->attr.name, "timeout_secs"))
848 new_op->upcall.req.param.op =
849 ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
850
851 if (!strcmp(ccache_attr->attr.name, "hard_limit"))
852 new_op->upcall.req.param.op =
853 ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
854
855 if (!strcmp(ccache_attr->attr.name, "soft_limit"))
856 new_op->upcall.req.param.op =
857 ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
858
859 if (!strcmp(ccache_attr->attr.name, "reclaim_percentage"))
860 new_op->upcall.req.param.op =
861 ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
862
863 } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
864 ncache_attr = (struct ncache_orangefs_attribute *)attr;
865
866 if (!strcmp(ncache_attr->attr.name, "timeout_msecs"))
867 new_op->upcall.req.param.op =
868 ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
869
870 if (!strcmp(ncache_attr->attr.name, "hard_limit"))
871 new_op->upcall.req.param.op =
872 ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
873
874 if (!strcmp(ncache_attr->attr.name, "soft_limit"))
875 new_op->upcall.req.param.op =
876 ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
877
878 if (!strcmp(ncache_attr->attr.name, "reclaim_percentage"))
879 new_op->upcall.req.param.op =
880 ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
881
882 } else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
883 pc_attr = (struct pc_orangefs_attribute *)attr;
884
885 if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID))
886 new_op->upcall.req.perf_count.type =
887 ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
888
889 if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID))
890 new_op->upcall.req.perf_count.type =
891 ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
892
893 if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID))
894 new_op->upcall.req.perf_count.type =
895 ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
896
897 } else {
898 gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
899 kobj_id);
900 rc = -EINVAL;
901 goto out;
902 }
903
904
905 if (strcmp(kobj_id, PC_KOBJ_ID))
906 ser_op_type = "orangefs_param";
907 else
908 ser_op_type = "orangefs_perf_count";
909
910 /*
911 * The service_operation will return an errno return code on
912 * error, and zero on success.
913 */
914 rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE);
915
916out:
917 if (!rc) {
918 if (strcmp(kobj_id, PC_KOBJ_ID)) {
919 rc = scnprintf(buf,
920 PAGE_SIZE,
921 "%d\n",
922 (int)new_op->downcall.resp.param.value);
923 } else {
924 rc = scnprintf(
925 buf,
926 PAGE_SIZE,
927 "%s",
928 new_op->downcall.resp.perf_count.buffer);
929 }
930 }
931
932 op_release(new_op);
933
934 return rc;
935
936}
937
938static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
939 struct orangefs_attribute *attr,
940 char *buf)
941{
942 int rc = 0;
943
944 rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
945
946 return rc;
947}
948
949static ssize_t
950 service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
951 struct acache_orangefs_attribute *attr,
952 char *buf)
953{
954 int rc = 0;
955
956 rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
957
958 return rc;
959}
960
961static ssize_t service_capcache_show(struct capcache_orangefs_obj
962 *capcache_orangefs_obj,
963 struct capcache_orangefs_attribute *attr,
964 char *buf)
965{
966 int rc = 0;
967
968 rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
969
970 return rc;
971}
972
973static ssize_t service_ccache_show(struct ccache_orangefs_obj
974 *ccache_orangefs_obj,
975 struct ccache_orangefs_attribute *attr,
976 char *buf)
977{
978 int rc = 0;
979
980 rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
981
982 return rc;
983}
984
985static ssize_t
986 service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
987 struct ncache_orangefs_attribute *attr,
988 char *buf)
989{
990 int rc = 0;
991
992 rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
993
994 return rc;
995}
996
997static ssize_t
998 service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
999 struct pc_orangefs_attribute *attr,
1000 char *buf)
1001{
1002 int rc = 0;
1003
1004 rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
1005
1006 return rc;
1007}
1008
1009/*
1010 * pass attribute values back to userspace with a service operation.
1011 *
 1012 * We have to do a memory allocation, a kstrtoint and a service operation.
1013 * And we have to evaluate what the user entered, to make sure the
1014 * value is within the range supported by the attribute. So, there's
1015 * a lot of return code checking and mapping going on here.
1016 *
1017 * We want to return 1 if we think everything went OK, and
 1018 * -EINVAL if not.
1019 */
1020static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
1021{
1022 struct orangefs_kernel_op_s *new_op = NULL;
1023 int val = 0;
1024 int rc = 0;
1025 struct orangefs_attribute *orangefs_attr;
1026 struct acache_orangefs_attribute *acache_attr;
1027 struct capcache_orangefs_attribute *capcache_attr;
1028 struct ccache_orangefs_attribute *ccache_attr;
1029 struct ncache_orangefs_attribute *ncache_attr;
1030
1031 gossip_debug(GOSSIP_SYSFS_DEBUG,
1032 "sysfs_service_op_store: id:%s:\n",
1033 kobj_id);
1034
1035 new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
1036 if (!new_op)
1037 return -EINVAL; /* sic */
1038
1039 /* Can't do a service_operation if the client is not running... */
1040 rc = is_daemon_in_service();
1041 if (rc) {
1042 pr_info("%s: Client not running :%d:\n",
1043 __func__,
1044 is_daemon_in_service());
1045 goto out;
1046 }
1047
1048 /*
 1049 * The value we want to send to the client-core is in buf.
1050 */
1051 rc = kstrtoint(buf, 0, &val);
1052 if (rc)
1053 goto out;
1054
1055 if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
1056 orangefs_attr = (struct orangefs_attribute *)attr;
1057
1058 if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) {
1059 if (val > 0) {
1060 new_op->upcall.req.param.op =
1061 ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
1062 } else {
1063 rc = 0;
1064 goto out;
1065 }
1066 } else if (!strcmp(orangefs_attr->attr.name,
1067 "perf_time_interval_secs")) {
1068 if (val > 0) {
1069 new_op->upcall.req.param.op =
1070 ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
1071 } else {
1072 rc = 0;
1073 goto out;
1074 }
1075 } else if (!strcmp(orangefs_attr->attr.name,
1076 "perf_counter_reset")) {
1077 if ((val == 0) || (val == 1)) {
1078 new_op->upcall.req.param.op =
1079 ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
1080 } else {
1081 rc = 0;
1082 goto out;
1083 }
1084 }
1085
1086 } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
1087 acache_attr = (struct acache_orangefs_attribute *)attr;
1088
1089 if (!strcmp(acache_attr->attr.name, "hard_limit")) {
1090 if (val > -1) {
1091 new_op->upcall.req.param.op =
1092 ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
1093 } else {
1094 rc = 0;
1095 goto out;
1096 }
1097 } else if (!strcmp(acache_attr->attr.name, "soft_limit")) {
1098 if (val > -1) {
1099 new_op->upcall.req.param.op =
1100 ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
1101 } else {
1102 rc = 0;
1103 goto out;
1104 }
1105 } else if (!strcmp(acache_attr->attr.name,
1106 "reclaim_percentage")) {
1107 if ((val > -1) && (val < 101)) {
1108 new_op->upcall.req.param.op =
1109 ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
1110 } else {
1111 rc = 0;
1112 goto out;
1113 }
1114 } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) {
1115 if (val > -1) {
1116 new_op->upcall.req.param.op =
1117 ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
1118 } else {
1119 rc = 0;
1120 goto out;
1121 }
1122 }
1123
1124 } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
1125 capcache_attr = (struct capcache_orangefs_attribute *)attr;
1126
1127 if (!strcmp(capcache_attr->attr.name, "hard_limit")) {
1128 if (val > -1) {
1129 new_op->upcall.req.param.op =
1130 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
1131 } else {
1132 rc = 0;
1133 goto out;
1134 }
1135 } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) {
1136 if (val > -1) {
1137 new_op->upcall.req.param.op =
1138 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
1139 } else {
1140 rc = 0;
1141 goto out;
1142 }
1143 } else if (!strcmp(capcache_attr->attr.name,
1144 "reclaim_percentage")) {
1145 if ((val > -1) && (val < 101)) {
1146 new_op->upcall.req.param.op =
1147 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
1148 } else {
1149 rc = 0;
1150 goto out;
1151 }
1152 } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) {
1153 if (val > -1) {
1154 new_op->upcall.req.param.op =
1155 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
1156 } else {
1157 rc = 0;
1158 goto out;
1159 }
1160 }
1161
1162 } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
1163 ccache_attr = (struct ccache_orangefs_attribute *)attr;
1164
1165 if (!strcmp(ccache_attr->attr.name, "hard_limit")) {
1166 if (val > -1) {
1167 new_op->upcall.req.param.op =
1168 ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
1169 } else {
1170 rc = 0;
1171 goto out;
1172 }
1173 } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) {
1174 if (val > -1) {
1175 new_op->upcall.req.param.op =
1176 ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
1177 } else {
1178 rc = 0;
1179 goto out;
1180 }
1181 } else if (!strcmp(ccache_attr->attr.name,
1182 "reclaim_percentage")) {
1183 if ((val > -1) && (val < 101)) {
1184 new_op->upcall.req.param.op =
1185 ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
1186 } else {
1187 rc = 0;
1188 goto out;
1189 }
1190 } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) {
1191 if (val > -1) {
1192 new_op->upcall.req.param.op =
1193 ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
1194 } else {
1195 rc = 0;
1196 goto out;
1197 }
1198 }
1199
1200 } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
1201 ncache_attr = (struct ncache_orangefs_attribute *)attr;
1202
1203 if (!strcmp(ncache_attr->attr.name, "hard_limit")) {
1204 if (val > -1) {
1205 new_op->upcall.req.param.op =
1206 ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
1207 } else {
1208 rc = 0;
1209 goto out;
1210 }
1211 } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) {
1212 if (val > -1) {
1213 new_op->upcall.req.param.op =
1214 ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
1215 } else {
1216 rc = 0;
1217 goto out;
1218 }
1219 } else if (!strcmp(ncache_attr->attr.name,
1220 "reclaim_percentage")) {
1221 if ((val > -1) && (val < 101)) {
1222 new_op->upcall.req.param.op =
1223 ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
1224 } else {
1225 rc = 0;
1226 goto out;
1227 }
1228 } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) {
1229 if (val > -1) {
1230 new_op->upcall.req.param.op =
1231 ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
1232 } else {
1233 rc = 0;
1234 goto out;
1235 }
1236 }
1237
1238 } else {
1239 gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
1240 kobj_id);
1241 rc = -EINVAL;
1242 goto out;
1243 }
1244
1245 new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
1246
1247 new_op->upcall.req.param.value = val;
1248
1249 /*
 1250 * The service_operation will return an errno return code on
1251 * error, and zero on success.
1252 */
1253 rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE);
1254
1255 if (rc < 0) {
1256 gossip_err("sysfs_service_op_store: service op returned:%d:\n",
1257 rc);
1258 rc = 0;
1259 } else {
1260 rc = 1;
1261 }
1262
1263out:
1264 op_release(new_op);
1265
1266 if (rc == -ENOMEM || rc == 0)
1267 rc = -EINVAL;
1268
1269 return rc;
1270}
1271
1272static ssize_t
1273 service_orangefs_store(struct orangefs_obj *orangefs_obj,
1274 struct orangefs_attribute *attr,
1275 const char *buf,
1276 size_t count)
1277{
1278 int rc = 0;
1279
1280 rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
1281
1282 /* rc should have an errno value if the service_op went bad. */
1283 if (rc == 1)
1284 rc = count;
1285
1286 return rc;
1287}
1288
1289static ssize_t
1290 service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
1291 struct acache_orangefs_attribute *attr,
1292 const char *buf,
1293 size_t count)
1294{
1295 int rc = 0;
1296
1297 rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
1298
1299 /* rc should have an errno value if the service_op went bad. */
1300 if (rc == 1)
1301 rc = count;
1302
1303 return rc;
1304}
1305
1306static ssize_t
1307 service_capcache_store(struct capcache_orangefs_obj
1308 *capcache_orangefs_obj,
1309 struct capcache_orangefs_attribute *attr,
1310 const char *buf,
1311 size_t count)
1312{
1313 int rc = 0;
1314
1315 rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
1316
1317 /* rc should have an errno value if the service_op went bad. */
1318 if (rc == 1)
1319 rc = count;
1320
1321 return rc;
1322}
1323
1324static ssize_t service_ccache_store(struct ccache_orangefs_obj
1325 *ccache_orangefs_obj,
1326 struct ccache_orangefs_attribute *attr,
1327 const char *buf,
1328 size_t count)
1329{
1330 int rc = 0;
1331
1332 rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
1333
1334 /* rc should have an errno value if the service_op went bad. */
1335 if (rc == 1)
1336 rc = count;
1337
1338 return rc;
1339}
1340
1341static ssize_t
1342 service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
1343 struct ncache_orangefs_attribute *attr,
1344 const char *buf,
1345 size_t count)
1346{
1347 int rc = 0;
1348
1349 rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
1350
1351 /* rc should have an errno value if the service_op went bad. */
1352 if (rc == 1)
1353 rc = count;
1354
1355 return rc;
1356}
1357
1358static struct orangefs_attribute op_timeout_secs_attribute =
1359 __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
1360
1361static struct orangefs_attribute slot_timeout_secs_attribute =
1362 __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
1363
1364static struct orangefs_attribute perf_counter_reset_attribute =
1365 __ATTR(perf_counter_reset,
1366 0664,
1367 service_orangefs_show,
1368 service_orangefs_store);
1369
1370static struct orangefs_attribute perf_history_size_attribute =
1371 __ATTR(perf_history_size,
1372 0664,
1373 service_orangefs_show,
1374 service_orangefs_store);
1375
1376static struct orangefs_attribute perf_time_interval_secs_attribute =
1377 __ATTR(perf_time_interval_secs,
1378 0664,
1379 service_orangefs_show,
1380 service_orangefs_store);
1381
1382static struct attribute *orangefs_default_attrs[] = {
1383 &op_timeout_secs_attribute.attr,
1384 &slot_timeout_secs_attribute.attr,
1385 &perf_counter_reset_attribute.attr,
1386 &perf_history_size_attribute.attr,
1387 &perf_time_interval_secs_attribute.attr,
1388 NULL,
1389};
1390
1391static struct kobj_type orangefs_ktype = {
1392 .sysfs_ops = &orangefs_sysfs_ops,
1393 .release = orangefs_release,
1394 .default_attrs = orangefs_default_attrs,
1395};
1396
1397static struct acache_orangefs_attribute acache_hard_limit_attribute =
1398 __ATTR(hard_limit,
1399 0664,
1400 service_acache_show,
1401 service_acache_store);
1402
1403static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
1404 __ATTR(reclaim_percentage,
1405 0664,
1406 service_acache_show,
1407 service_acache_store);
1408
1409static struct acache_orangefs_attribute acache_soft_limit_attribute =
1410 __ATTR(soft_limit,
1411 0664,
1412 service_acache_show,
1413 service_acache_store);
1414
1415static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
1416 __ATTR(timeout_msecs,
1417 0664,
1418 service_acache_show,
1419 service_acache_store);
1420
1421static struct attribute *acache_orangefs_default_attrs[] = {
1422 &acache_hard_limit_attribute.attr,
1423 &acache_reclaim_percent_attribute.attr,
1424 &acache_soft_limit_attribute.attr,
1425 &acache_timeout_msecs_attribute.attr,
1426 NULL,
1427};
1428
1429static struct kobj_type acache_orangefs_ktype = {
1430 .sysfs_ops = &acache_orangefs_sysfs_ops,
1431 .release = acache_orangefs_release,
1432 .default_attrs = acache_orangefs_default_attrs,
1433};
1434
1435static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
1436 __ATTR(hard_limit,
1437 0664,
1438 service_capcache_show,
1439 service_capcache_store);
1440
1441static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
1442 __ATTR(reclaim_percentage,
1443 0664,
1444 service_capcache_show,
1445 service_capcache_store);
1446
1447static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
1448 __ATTR(soft_limit,
1449 0664,
1450 service_capcache_show,
1451 service_capcache_store);
1452
1453static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
1454 __ATTR(timeout_secs,
1455 0664,
1456 service_capcache_show,
1457 service_capcache_store);
1458
1459static struct attribute *capcache_orangefs_default_attrs[] = {
1460 &capcache_hard_limit_attribute.attr,
1461 &capcache_reclaim_percent_attribute.attr,
1462 &capcache_soft_limit_attribute.attr,
1463 &capcache_timeout_secs_attribute.attr,
1464 NULL,
1465};
1466
1467static struct kobj_type capcache_orangefs_ktype = {
1468 .sysfs_ops = &capcache_orangefs_sysfs_ops,
1469 .release = capcache_orangefs_release,
1470 .default_attrs = capcache_orangefs_default_attrs,
1471};
1472
1473static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
1474 __ATTR(hard_limit,
1475 0664,
1476 service_ccache_show,
1477 service_ccache_store);
1478
1479static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
1480 __ATTR(reclaim_percentage,
1481 0664,
1482 service_ccache_show,
1483 service_ccache_store);
1484
1485static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
1486 __ATTR(soft_limit,
1487 0664,
1488 service_ccache_show,
1489 service_ccache_store);
1490
1491static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
1492 __ATTR(timeout_secs,
1493 0664,
1494 service_ccache_show,
1495 service_ccache_store);
1496
1497static struct attribute *ccache_orangefs_default_attrs[] = {
1498 &ccache_hard_limit_attribute.attr,
1499 &ccache_reclaim_percent_attribute.attr,
1500 &ccache_soft_limit_attribute.attr,
1501 &ccache_timeout_secs_attribute.attr,
1502 NULL,
1503};
1504
1505static struct kobj_type ccache_orangefs_ktype = {
1506 .sysfs_ops = &ccache_orangefs_sysfs_ops,
1507 .release = ccache_orangefs_release,
1508 .default_attrs = ccache_orangefs_default_attrs,
1509};
1510
1511static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
1512 __ATTR(hard_limit,
1513 0664,
1514 service_ncache_show,
1515 service_ncache_store);
1516
1517static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
1518 __ATTR(reclaim_percentage,
1519 0664,
1520 service_ncache_show,
1521 service_ncache_store);
1522
1523static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
1524 __ATTR(soft_limit,
1525 0664,
1526 service_ncache_show,
1527 service_ncache_store);
1528
1529static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
1530 __ATTR(timeout_msecs,
1531 0664,
1532 service_ncache_show,
1533 service_ncache_store);
1534
1535static struct attribute *ncache_orangefs_default_attrs[] = {
1536 &ncache_hard_limit_attribute.attr,
1537 &ncache_reclaim_percent_attribute.attr,
1538 &ncache_soft_limit_attribute.attr,
1539 &ncache_timeout_msecs_attribute.attr,
1540 NULL,
1541};
1542
1543static struct kobj_type ncache_orangefs_ktype = {
1544 .sysfs_ops = &ncache_orangefs_sysfs_ops,
1545 .release = ncache_orangefs_release,
1546 .default_attrs = ncache_orangefs_default_attrs,
1547};
1548
1549static struct pc_orangefs_attribute pc_acache_attribute =
1550 __ATTR(acache,
1551 0664,
1552 service_pc_show,
1553 NULL);
1554
1555static struct pc_orangefs_attribute pc_capcache_attribute =
1556 __ATTR(capcache,
1557 0664,
1558 service_pc_show,
1559 NULL);
1560
1561static struct pc_orangefs_attribute pc_ncache_attribute =
1562 __ATTR(ncache,
1563 0664,
1564 service_pc_show,
1565 NULL);
1566
1567static struct attribute *pc_orangefs_default_attrs[] = {
1568 &pc_acache_attribute.attr,
1569 &pc_capcache_attribute.attr,
1570 &pc_ncache_attribute.attr,
1571 NULL,
1572};
1573
1574static struct kobj_type pc_orangefs_ktype = {
1575 .sysfs_ops = &pc_orangefs_sysfs_ops,
1576 .release = pc_orangefs_release,
1577 .default_attrs = pc_orangefs_default_attrs,
1578};
1579
1580static struct stats_orangefs_attribute stats_reads_attribute =
1581 __ATTR(reads,
1582 0664,
1583 int_stats_show,
1584 NULL);
1585
1586static struct stats_orangefs_attribute stats_writes_attribute =
1587 __ATTR(writes,
1588 0664,
1589 int_stats_show,
1590 NULL);
1591
1592static struct attribute *stats_orangefs_default_attrs[] = {
1593 &stats_reads_attribute.attr,
1594 &stats_writes_attribute.attr,
1595 NULL,
1596};
1597
1598static struct kobj_type stats_orangefs_ktype = {
1599 .sysfs_ops = &stats_orangefs_sysfs_ops,
1600 .release = stats_orangefs_release,
1601 .default_attrs = stats_orangefs_default_attrs,
1602};
1603
1604static struct orangefs_obj *orangefs_obj;
1605static struct acache_orangefs_obj *acache_orangefs_obj;
1606static struct capcache_orangefs_obj *capcache_orangefs_obj;
1607static struct ccache_orangefs_obj *ccache_orangefs_obj;
1608static struct ncache_orangefs_obj *ncache_orangefs_obj;
1609static struct pc_orangefs_obj *pc_orangefs_obj;
1610static struct stats_orangefs_obj *stats_orangefs_obj;
1611
1612int orangefs_sysfs_init(void)
1613{
1614 int rc = -EINVAL;
1615
1616 gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n");
1617
1618 /* create /sys/fs/orangefs. */
1619 orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL);
1620 if (!orangefs_obj)
1621 goto out;
1622
1623 rc = kobject_init_and_add(&orangefs_obj->kobj,
1624 &orangefs_ktype,
1625 fs_kobj,
1626 ORANGEFS_KOBJ_ID);
1627
1628 if (rc)
1629 goto ofs_obj_bail;
1630
1631 kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
1632
1633 /* create /sys/fs/orangefs/acache. */
1634 acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
1635 if (!acache_orangefs_obj) {
1636 rc = -EINVAL;
1637 goto ofs_obj_bail;
1638 }
1639
1640 rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
1641 &acache_orangefs_ktype,
1642 &orangefs_obj->kobj,
1643 ACACHE_KOBJ_ID);
1644
1645 if (rc)
1646 goto acache_obj_bail;
1647
1648 kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
1649
1650 /* create /sys/fs/orangefs/capcache. */
1651 capcache_orangefs_obj =
1652 kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL);
1653 if (!capcache_orangefs_obj) {
1654 rc = -EINVAL;
1655 goto acache_obj_bail;
1656 }
1657
1658 rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
1659 &capcache_orangefs_ktype,
1660 &orangefs_obj->kobj,
1661 CAPCACHE_KOBJ_ID);
1662 if (rc)
1663 goto capcache_obj_bail;
1664
1665 kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
1666
1667 /* create /sys/fs/orangefs/ccache. */
1668 ccache_orangefs_obj =
1669 kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL);
1670 if (!ccache_orangefs_obj) {
1671 rc = -EINVAL;
1672 goto capcache_obj_bail;
1673 }
1674
1675 rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
1676 &ccache_orangefs_ktype,
1677 &orangefs_obj->kobj,
1678 CCACHE_KOBJ_ID);
1679 if (rc)
1680 goto ccache_obj_bail;
1681
1682 kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
1683
1684 /* create /sys/fs/orangefs/ncache. */
1685 ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
1686 if (!ncache_orangefs_obj) {
1687 rc = -EINVAL;
1688 goto ccache_obj_bail;
1689 }
1690
1691 rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
1692 &ncache_orangefs_ktype,
1693 &orangefs_obj->kobj,
1694 NCACHE_KOBJ_ID);
1695
1696 if (rc)
1697 goto ncache_obj_bail;
1698
1699 kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
1700
1701 /* create /sys/fs/orangefs/perf_counters. */
1702 pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
1703 if (!pc_orangefs_obj) {
1704 rc = -EINVAL;
1705 goto ncache_obj_bail;
1706 }
1707
1708 rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
1709 &pc_orangefs_ktype,
1710 &orangefs_obj->kobj,
1711 "perf_counters");
1712
1713 if (rc)
1714 goto pc_obj_bail;
1715
1716 kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
1717
1718 /* create /sys/fs/orangefs/stats. */
1719 stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
1720 if (!stats_orangefs_obj) {
1721 rc = -EINVAL;
1722 goto pc_obj_bail;
1723 }
1724
1725 rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
1726 &stats_orangefs_ktype,
1727 &orangefs_obj->kobj,
1728 STATS_KOBJ_ID);
1729
1730 if (rc)
1731 goto stats_obj_bail;
1732
1733 kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
1734 goto out;
1735
1736stats_obj_bail:
1737 kobject_put(&stats_orangefs_obj->kobj);
1738
1739pc_obj_bail:
1740 kobject_put(&pc_orangefs_obj->kobj);
1741
1742ncache_obj_bail:
1743 kobject_put(&ncache_orangefs_obj->kobj);
1744
1745ccache_obj_bail:
1746 kobject_put(&ccache_orangefs_obj->kobj);
1747
1748capcache_obj_bail:
1749 kobject_put(&capcache_orangefs_obj->kobj);
1750
1751acache_obj_bail:
1752 kobject_put(&acache_orangefs_obj->kobj);
1753
1754ofs_obj_bail:
1755 kobject_put(&orangefs_obj->kobj);
1756out:
1757 return rc;
1758}
1759
1760void orangefs_sysfs_exit(void)
1761{
1762 gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
1763
1764 kobject_put(&acache_orangefs_obj->kobj);
1765 kobject_put(&capcache_orangefs_obj->kobj);
1766 kobject_put(&ccache_orangefs_obj->kobj);
1767 kobject_put(&ncache_orangefs_obj->kobj);
1768 kobject_put(&pc_orangefs_obj->kobj);
1769 kobject_put(&stats_orangefs_obj->kobj);
1770
1771 kobject_put(&orangefs_obj->kobj);
1772}
diff --git a/fs/orangefs/orangefs-sysfs.h b/fs/orangefs/orangefs-sysfs.h
new file mode 100644
index 000000000000..f0b76382db02
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.h
@@ -0,0 +1,2 @@
1extern int orangefs_sysfs_init(void);
2extern void orangefs_sysfs_exit(void);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
new file mode 100644
index 000000000000..40f5163b56aa
--- /dev/null
+++ b/fs/orangefs/orangefs-utils.c
@@ -0,0 +1,1048 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6#include "protocol.h"
7#include "orangefs-kernel.h"
8#include "orangefs-dev-proto.h"
9#include "orangefs-bufmap.h"
10
11__s32 fsid_of_op(struct orangefs_kernel_op_s *op)
12{
13 __s32 fsid = ORANGEFS_FS_ID_NULL;
14
15 if (op) {
16 switch (op->upcall.type) {
17 case ORANGEFS_VFS_OP_FILE_IO:
18 fsid = op->upcall.req.io.refn.fs_id;
19 break;
20 case ORANGEFS_VFS_OP_LOOKUP:
21 fsid = op->upcall.req.lookup.parent_refn.fs_id;
22 break;
23 case ORANGEFS_VFS_OP_CREATE:
24 fsid = op->upcall.req.create.parent_refn.fs_id;
25 break;
26 case ORANGEFS_VFS_OP_GETATTR:
27 fsid = op->upcall.req.getattr.refn.fs_id;
28 break;
29 case ORANGEFS_VFS_OP_REMOVE:
30 fsid = op->upcall.req.remove.parent_refn.fs_id;
31 break;
32 case ORANGEFS_VFS_OP_MKDIR:
33 fsid = op->upcall.req.mkdir.parent_refn.fs_id;
34 break;
35 case ORANGEFS_VFS_OP_READDIR:
36 fsid = op->upcall.req.readdir.refn.fs_id;
37 break;
38 case ORANGEFS_VFS_OP_SETATTR:
39 fsid = op->upcall.req.setattr.refn.fs_id;
40 break;
41 case ORANGEFS_VFS_OP_SYMLINK:
42 fsid = op->upcall.req.sym.parent_refn.fs_id;
43 break;
44 case ORANGEFS_VFS_OP_RENAME:
45 fsid = op->upcall.req.rename.old_parent_refn.fs_id;
46 break;
47 case ORANGEFS_VFS_OP_STATFS:
48 fsid = op->upcall.req.statfs.fs_id;
49 break;
50 case ORANGEFS_VFS_OP_TRUNCATE:
51 fsid = op->upcall.req.truncate.refn.fs_id;
52 break;
53 case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
54 fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
55 break;
56 case ORANGEFS_VFS_OP_FS_UMOUNT:
57 fsid = op->upcall.req.fs_umount.fs_id;
58 break;
59 case ORANGEFS_VFS_OP_GETXATTR:
60 fsid = op->upcall.req.getxattr.refn.fs_id;
61 break;
62 case ORANGEFS_VFS_OP_SETXATTR:
63 fsid = op->upcall.req.setxattr.refn.fs_id;
64 break;
65 case ORANGEFS_VFS_OP_LISTXATTR:
66 fsid = op->upcall.req.listxattr.refn.fs_id;
67 break;
68 case ORANGEFS_VFS_OP_REMOVEXATTR:
69 fsid = op->upcall.req.removexattr.refn.fs_id;
70 break;
71 case ORANGEFS_VFS_OP_FSYNC:
72 fsid = op->upcall.req.fsync.refn.fs_id;
73 break;
74 default:
75 break;
76 }
77 }
78 return fsid;
79}
80
81static int orangefs_inode_flags(struct ORANGEFS_sys_attr_s *attrs)
82{
83 int flags = 0;
84 if (attrs->flags & ORANGEFS_IMMUTABLE_FL)
85 flags |= S_IMMUTABLE;
86 else
87 flags &= ~S_IMMUTABLE;
88 if (attrs->flags & ORANGEFS_APPEND_FL)
89 flags |= S_APPEND;
90 else
91 flags &= ~S_APPEND;
92 if (attrs->flags & ORANGEFS_NOATIME_FL)
93 flags |= S_NOATIME;
94 else
95 flags &= ~S_NOATIME;
96 return flags;
97}
98
99static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs)
100{
101 int perm_mode = 0;
102
103 if (attrs->perms & ORANGEFS_O_EXECUTE)
104 perm_mode |= S_IXOTH;
105 if (attrs->perms & ORANGEFS_O_WRITE)
106 perm_mode |= S_IWOTH;
107 if (attrs->perms & ORANGEFS_O_READ)
108 perm_mode |= S_IROTH;
109
110 if (attrs->perms & ORANGEFS_G_EXECUTE)
111 perm_mode |= S_IXGRP;
112 if (attrs->perms & ORANGEFS_G_WRITE)
113 perm_mode |= S_IWGRP;
114 if (attrs->perms & ORANGEFS_G_READ)
115 perm_mode |= S_IRGRP;
116
117 if (attrs->perms & ORANGEFS_U_EXECUTE)
118 perm_mode |= S_IXUSR;
119 if (attrs->perms & ORANGEFS_U_WRITE)
120 perm_mode |= S_IWUSR;
121 if (attrs->perms & ORANGEFS_U_READ)
122 perm_mode |= S_IRUSR;
123
124 if (attrs->perms & ORANGEFS_G_SGID)
125 perm_mode |= S_ISGID;
126 if (attrs->perms & ORANGEFS_U_SUID)
127 perm_mode |= S_ISUID;
128
129 return perm_mode;
130}
131
132/*
133 * NOTE: in kernel land, we never use the sys_attr->link_target for
134 * anything, so don't bother copying it into the sys_attr object here.
135 */
136static inline int copy_attributes_from_inode(struct inode *inode,
137 struct ORANGEFS_sys_attr_s *attrs,
138 struct iattr *iattr)
139{
140 umode_t tmp_mode;
141
142 if (!iattr || !inode || !attrs) {
143 gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
144 "in copy_attributes_from_inode!\n",
145 iattr,
146 inode,
147 attrs);
148 return -EINVAL;
149 }
150 /*
151 * We need to be careful to only copy the attributes out of the
152 * iattr object that we know are valid.
153 */
154 attrs->mask = 0;
155 if (iattr->ia_valid & ATTR_UID) {
156 attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid);
157 attrs->mask |= ORANGEFS_ATTR_SYS_UID;
158 gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
159 }
160 if (iattr->ia_valid & ATTR_GID) {
161 attrs->group = from_kgid(current_user_ns(), iattr->ia_gid);
162 attrs->mask |= ORANGEFS_ATTR_SYS_GID;
163 gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
164 }
165
166 if (iattr->ia_valid & ATTR_ATIME) {
167 attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
168 if (iattr->ia_valid & ATTR_ATIME_SET) {
169 attrs->atime = (time64_t)iattr->ia_atime.tv_sec;
170 attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
171 }
172 }
173 if (iattr->ia_valid & ATTR_MTIME) {
174 attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
175 if (iattr->ia_valid & ATTR_MTIME_SET) {
176 attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec;
177 attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
178 }
179 }
180 if (iattr->ia_valid & ATTR_CTIME)
181 attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;
182
183 /*
184 * ORANGEFS cannot set size with a setattr operation. It is unlikely
185 * to be requested through the VFS, but just in case, ignore
186 * ATTR_SIZE.
187 */
188
189 if (iattr->ia_valid & ATTR_MODE) {
190 tmp_mode = iattr->ia_mode;
191 if (tmp_mode & (S_ISVTX)) {
192 if (is_root_handle(inode)) {
193 /*
194 * allow sticky bit to be set on root (since
195 * it shows up that way by default anyhow),
196 * but don't show it to the server
197 */
198 tmp_mode -= S_ISVTX;
199 } else {
200 gossip_debug(GOSSIP_UTILS_DEBUG,
201 "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
202 return -EINVAL;
203 }
204 }
205
206 if (tmp_mode & (S_ISUID)) {
207 gossip_debug(GOSSIP_UTILS_DEBUG,
208 "Attempting to set setuid bit (not supported); returning EINVAL.\n");
209 return -EINVAL;
210 }
211
212 attrs->perms = ORANGEFS_util_translate_mode(tmp_mode);
213 attrs->mask |= ORANGEFS_ATTR_SYS_PERM;
214 }
215
216 return 0;
217}
218
219static int orangefs_inode_type(enum orangefs_ds_type objtype)
220{
221 if (objtype == ORANGEFS_TYPE_METAFILE)
222 return S_IFREG;
223 else if (objtype == ORANGEFS_TYPE_DIRECTORY)
224 return S_IFDIR;
225 else if (objtype == ORANGEFS_TYPE_SYMLINK)
226 return S_IFLNK;
227 else
228 return -1;
229}
230
231static int orangefs_inode_is_stale(struct inode *inode, int new,
232 struct ORANGEFS_sys_attr_s *attrs, char *link_target)
233{
234 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
235 int type = orangefs_inode_type(attrs->objtype);
236 if (!new) {
237 /*
238 * If the inode type or symlink target have changed then this
239 * inode is stale.
240 */
241 if (type == -1 || !(inode->i_mode & type)) {
242 orangefs_make_bad_inode(inode);
243 return 1;
244 }
245 if (type == S_IFLNK && strncmp(orangefs_inode->link_target,
246 link_target, ORANGEFS_NAME_MAX)) {
247 orangefs_make_bad_inode(inode);
248 return 1;
249 }
250 }
251 return 0;
252}
253
254int orangefs_inode_getattr(struct inode *inode, int new, int size)
255{
256 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
257 struct orangefs_kernel_op_s *new_op;
258 loff_t inode_size, rounded_up_size;
259 int ret, type;
260
261 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
262 get_khandle_from_ino(inode));
263
264 new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
265 if (!new_op)
266 return -ENOMEM;
267 new_op->upcall.req.getattr.refn = orangefs_inode->refn;
268 new_op->upcall.req.getattr.mask = size ?
269 ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
270
271 ret = service_operation(new_op, __func__,
272 get_interruptible_flag(inode));
273 if (ret != 0)
274 goto out;
275
276 type = orangefs_inode_type(new_op->
277 downcall.resp.getattr.attributes.objtype);
278 ret = orangefs_inode_is_stale(inode, new,
279 &new_op->downcall.resp.getattr.attributes,
280 new_op->downcall.resp.getattr.link_target);
281 if (ret) {
282 ret = -ESTALE;
283 goto out;
284 }
285
286 switch (type) {
287 case S_IFREG:
288 inode->i_flags = orangefs_inode_flags(&new_op->
289 downcall.resp.getattr.attributes);
290 if (size) {
291 inode_size = (loff_t)new_op->
292 downcall.resp.getattr.attributes.size;
293 rounded_up_size =
294 (inode_size + (4096 - (inode_size % 4096)));
295 inode->i_size = inode_size;
296 orangefs_inode->blksize =
297 new_op->downcall.resp.getattr.attributes.blksize;
298 spin_lock(&inode->i_lock);
299 inode->i_bytes = inode_size;
300 inode->i_blocks =
301 (unsigned long)(rounded_up_size / 512);
302 spin_unlock(&inode->i_lock);
303 }
304 break;
305 case S_IFDIR:
306 inode->i_size = PAGE_CACHE_SIZE;
307 orangefs_inode->blksize = (1 << inode->i_blkbits);
308 spin_lock(&inode->i_lock);
309 inode_set_bytes(inode, inode->i_size);
310 spin_unlock(&inode->i_lock);
311 set_nlink(inode, 1);
312 break;
313 case S_IFLNK:
314 if (new) {
315 inode->i_size = (loff_t)strlen(new_op->
316 downcall.resp.getattr.link_target);
317 orangefs_inode->blksize = (1 << inode->i_blkbits);
318 strlcpy(orangefs_inode->link_target,
319 new_op->downcall.resp.getattr.link_target,
320 ORANGEFS_NAME_MAX);
321 inode->i_link = orangefs_inode->link_target;
322 }
323 break;
324 }
325
326 inode->i_uid = make_kuid(&init_user_ns, new_op->
327 downcall.resp.getattr.attributes.owner);
328 inode->i_gid = make_kgid(&init_user_ns, new_op->
329 downcall.resp.getattr.attributes.group);
330 inode->i_atime.tv_sec = (time64_t)new_op->
331 downcall.resp.getattr.attributes.atime;
332 inode->i_mtime.tv_sec = (time64_t)new_op->
333 downcall.resp.getattr.attributes.mtime;
334 inode->i_ctime.tv_sec = (time64_t)new_op->
335 downcall.resp.getattr.attributes.ctime;
336 inode->i_atime.tv_nsec = 0;
337 inode->i_mtime.tv_nsec = 0;
338 inode->i_ctime.tv_nsec = 0;
339
340 /* special case: mark the root inode as sticky */
341 inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
342 orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
343
344 ret = 0;
345out:
346 op_release(new_op);
347 return ret;
348}
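
A quick worked example of the S_IFREG block accounting above (illustrative
numbers only):

/*
 * For inode_size = 5000:
 *   rounded_up_size = 5000 + (4096 - 5000 % 4096) = 8192
 *   i_blocks        = 8192 / 512 = 16   (512-byte sectors)
 * Note the expression always rounds up by at least one byte, so an
 * exact multiple of 4096 gains a full extra 4096 before the division.
 */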
349
350int orangefs_inode_check_changed(struct inode *inode)
351{
352 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
353 struct orangefs_kernel_op_s *new_op;
354 int ret;
355
356 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
357 get_khandle_from_ino(inode));
358
359 new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
360 if (!new_op)
361 return -ENOMEM;
362 new_op->upcall.req.getattr.refn = orangefs_inode->refn;
363 new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_TYPE |
364 ORANGEFS_ATTR_SYS_LNK_TARGET;
365
366 ret = service_operation(new_op, __func__,
367 get_interruptible_flag(inode));
368 if (ret != 0)
369 goto out;
370
371 ret = orangefs_inode_is_stale(inode, 0,
372 &new_op->downcall.resp.getattr.attributes,
373 new_op->downcall.resp.getattr.link_target);
374out:
375 op_release(new_op);
376 return ret;
377}
378
379/*
380 * Issues an orangefs setattr request so that the new attribute values
381 * take effect if successful. Returns 0 on success; -errno otherwise.
382 */
383int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
384{
385 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
386 struct orangefs_kernel_op_s *new_op;
387 int ret;
388
389 new_op = op_alloc(ORANGEFS_VFS_OP_SETATTR);
390 if (!new_op)
391 return -ENOMEM;
392
393 new_op->upcall.req.setattr.refn = orangefs_inode->refn;
394 ret = copy_attributes_from_inode(inode,
395 &new_op->upcall.req.setattr.attributes,
396 iattr);
397 if (ret >= 0) {
398 ret = service_operation(new_op, __func__,
399 get_interruptible_flag(inode));
400
401 gossip_debug(GOSSIP_UTILS_DEBUG,
402 "orangefs_inode_setattr: returning %d\n",
403 ret);
404 }
405
406 op_release(new_op);
407
408 /*
409 * successful setattr should clear the atime, mtime and
410 * ctime flags.
411 */
412 if (ret == 0) {
413 ClearAtimeFlag(orangefs_inode);
414 ClearMtimeFlag(orangefs_inode);
415 ClearCtimeFlag(orangefs_inode);
416 ClearModeFlag(orangefs_inode);
417 }
418
419 return ret;
420}
421
422int orangefs_flush_inode(struct inode *inode)
423{
424 /*
425 * If it is a dirty inode, this function gets called.
426 * Gather all the information that needs to be setattr'ed.
427 * Right now, this is only used for mode, atime, mtime
428 * and/or ctime.
429 */
430 struct iattr wbattr;
431 int ret;
432 int mtime_flag;
433 int ctime_flag;
434 int atime_flag;
435 int mode_flag;
436 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
437
438 memset(&wbattr, 0, sizeof(wbattr));
439
440 /*
441 * check inode flags up front, and clear them if they are set. This
442 * will prevent multiple processes from all trying to flush the same
443 * inode if they call close() simultaneously
444 */
445 mtime_flag = MtimeFlag(orangefs_inode);
446 ClearMtimeFlag(orangefs_inode);
447 ctime_flag = CtimeFlag(orangefs_inode);
448 ClearCtimeFlag(orangefs_inode);
449 atime_flag = AtimeFlag(orangefs_inode);
450 ClearAtimeFlag(orangefs_inode);
451 mode_flag = ModeFlag(orangefs_inode);
452 ClearModeFlag(orangefs_inode);
453
454 /* -- Lazy atime, mtime and ctime update --
455 * Note: all times are dictated by the server in the new scheme,
456 * not by the clients.
457 *
458 * Mode updates are also handled here.
459 */
460
461 if (mtime_flag)
462 wbattr.ia_valid |= ATTR_MTIME;
463 if (ctime_flag)
464 wbattr.ia_valid |= ATTR_CTIME;
465 if (atime_flag)
466 wbattr.ia_valid |= ATTR_ATIME;
467
468 if (mode_flag) {
469 wbattr.ia_mode = inode->i_mode;
470 wbattr.ia_valid |= ATTR_MODE;
471 }
472
473 gossip_debug(GOSSIP_UTILS_DEBUG,
474 "*********** orangefs_flush_inode: %pU "
475 "(ia_valid %d)\n",
476 get_khandle_from_ino(inode),
477 wbattr.ia_valid);
478 if (wbattr.ia_valid == 0) {
479 gossip_debug(GOSSIP_UTILS_DEBUG,
480 "orangefs_flush_inode skipping setattr()\n");
481 return 0;
482 }
483
484 gossip_debug(GOSSIP_UTILS_DEBUG,
485 "orangefs_flush_inode (%pU) writing mode %o\n",
486 get_khandle_from_ino(inode),
487 inode->i_mode);
488
489 ret = orangefs_inode_setattr(inode, &wbattr);
490
491 return ret;
492}
493
494int orangefs_unmount_sb(struct super_block *sb)
495{
496 int ret = -EINVAL;
497 struct orangefs_kernel_op_s *new_op = NULL;
498
499 gossip_debug(GOSSIP_UTILS_DEBUG,
500 "orangefs_unmount_sb called on sb %p\n",
501 sb);
502
503 new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
504 if (!new_op)
505 return -ENOMEM;
506 new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
507 new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
508 strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
509 ORANGEFS_SB(sb)->devname,
510 ORANGEFS_MAX_SERVER_ADDR_LEN);
511
512 gossip_debug(GOSSIP_UTILS_DEBUG,
513 "Attempting ORANGEFS Unmount via host %s\n",
514 new_op->upcall.req.fs_umount.orangefs_config_server);
515
516 ret = service_operation(new_op, "orangefs_fs_umount", 0);
517
518 gossip_debug(GOSSIP_UTILS_DEBUG,
519 "orangefs_unmount: got return value of %d\n", ret);
520 if (ret)
521 sb = ERR_PTR(ret);
522 else
523 ORANGEFS_SB(sb)->mount_pending = 1;
524
525 op_release(new_op);
526 return ret;
527}
528
529void orangefs_make_bad_inode(struct inode *inode)
530{
531 if (is_root_handle(inode)) {
532 /*
533 * if this occurs, the pvfs2-client-core was killed but we
534 * can't afford to lose the inode operations and such
535 * associated with the root handle in any case.
536 */
537 gossip_debug(GOSSIP_UTILS_DEBUG,
538 "*** NOT making bad root inode %pU\n",
539 get_khandle_from_ino(inode));
540 } else {
541 gossip_debug(GOSSIP_UTILS_DEBUG,
542 "*** making bad inode %pU\n",
543 get_khandle_from_ino(inode));
544 make_bad_inode(inode);
545 }
546}
547
548/*
549 * The following is a very dirty hack that is now a permanent part of the
550 * ORANGEFS protocol. See protocol.h for more error definitions.
551 */
552
553/* The order matches include/orangefs-types.h in the OrangeFS source. */
554static int PINT_errno_mapping[] = {
555 0, EPERM, ENOENT, EINTR, EIO, ENXIO, EBADF, EAGAIN, ENOMEM,
556 EFAULT, EBUSY, EEXIST, ENODEV, ENOTDIR, EISDIR, EINVAL, EMFILE,
557 EFBIG, ENOSPC, EROFS, EMLINK, EPIPE, EDEADLK, ENAMETOOLONG,
558 ENOLCK, ENOSYS, ENOTEMPTY, ELOOP, EWOULDBLOCK, ENOMSG, EUNATCH,
559 EBADR, EDEADLOCK, ENODATA, ETIME, ENONET, EREMOTE, ECOMM,
560 EPROTO, EBADMSG, EOVERFLOW, ERESTART, EMSGSIZE, EPROTOTYPE,
561 ENOPROTOOPT, EPROTONOSUPPORT, EOPNOTSUPP, EADDRINUSE,
562 EADDRNOTAVAIL, ENETDOWN, ENETUNREACH, ENETRESET, ENOBUFS,
563 ETIMEDOUT, ECONNREFUSED, EHOSTDOWN, EHOSTUNREACH, EALREADY,
564 EACCES, ECONNRESET, ERANGE
565};
566
567int orangefs_normalize_to_errno(__s32 error_code)
568{
569 __u32 i;
570
571 /* Success */
572 if (error_code == 0) {
573 return 0;
574 /*
575 * This shouldn't ever happen. If it does it should be fixed on the
576 * server.
577 */
578 } else if (error_code > 0) {
579 gossip_err("orangefs: error status receieved.\n");
580 gossip_err("orangefs: assuming error code is inverted.\n");
581 error_code = -error_code;
582 }
583
584 /*
585 * XXX: This is very bad since error codes from ORANGEFS may not be
586 * suitable for return into userspace.
587 */
588
589 /*
590 * Convert ORANGEFS error values into errno values suitable for return
591 * from the kernel.
592 */
593 if ((-error_code) & ORANGEFS_NON_ERRNO_ERROR_BIT) {
594 if (((-error_code) &
595 (ORANGEFS_ERROR_NUMBER_BITS|ORANGEFS_NON_ERRNO_ERROR_BIT|
596 ORANGEFS_ERROR_BIT)) == ORANGEFS_ECANCEL) {
597 /*
598 * cancellation error codes generally correspond to
599 * a timeout from the client's perspective
600 */
601 error_code = -ETIMEDOUT;
602 } else {
603 /* assume a default error code */
604 gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code);
605 error_code = -EINVAL;
606 }
607
608 /* Convert ORANGEFS encoded errno values into regular errno values. */
609 } else if ((-error_code) & ORANGEFS_ERROR_BIT) {
610 i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS);
611 if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping))
612 error_code = -PINT_errno_mapping[i];
613 else
614 error_code = -EINVAL;
615
616 /*
617 * Only ORANGEFS protocol error codes should ever come here. Otherwise
618 * there is a bug somewhere.
619 */
620 } else {
621 gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n");
622 }
623 return error_code;
624}
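
A minimal userspace-style sketch of the decode logic above, using the bit
macros defined in protocol.h later in this patch; this is illustration only,
not part of the driver:

#include <stdio.h>

#define ORANGEFS_ERROR_BIT           (1 << 30)
#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29)
#define ORANGEFS_ERROR_CLASS_BITS    0x380
#define ORANGEFS_ERROR_NUMBER_BITS   0x7f
#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)

int main(void)
{
	int error_code = -ORANGEFS_ECANCEL;	/* as delivered by the server */

	if ((-error_code) & ORANGEFS_NON_ERRNO_ERROR_BIT)
		/* protocol error: ECANCEL becomes -ETIMEDOUT, rest -EINVAL */
		printf("protocol error, number %d\n",
		       (-error_code) & ORANGEFS_ERROR_NUMBER_BITS);
	else if ((-error_code) & ORANGEFS_ERROR_BIT)
		/* encoded errno: index into PINT_errno_mapping[] */
		printf("errno index %d\n",
		       (-error_code) & ~(ORANGEFS_ERROR_BIT |
					 ORANGEFS_ERROR_CLASS_BITS));
	return 0;
}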
625
626#define NUM_MODES 11
627__s32 ORANGEFS_util_translate_mode(int mode)
628{
629 int ret = 0;
630 int i = 0;
631 static int modes[NUM_MODES] = {
632 S_IXOTH, S_IWOTH, S_IROTH,
633 S_IXGRP, S_IWGRP, S_IRGRP,
634 S_IXUSR, S_IWUSR, S_IRUSR,
635 S_ISGID, S_ISUID
636 };
637 static int orangefs_modes[NUM_MODES] = {
638 ORANGEFS_O_EXECUTE, ORANGEFS_O_WRITE, ORANGEFS_O_READ,
639 ORANGEFS_G_EXECUTE, ORANGEFS_G_WRITE, ORANGEFS_G_READ,
640 ORANGEFS_U_EXECUTE, ORANGEFS_U_WRITE, ORANGEFS_U_READ,
641 ORANGEFS_G_SGID, ORANGEFS_U_SUID
642 };
643
644 for (i = 0; i < NUM_MODES; i++)
645 if (mode & modes[i])
646 ret |= orangefs_modes[i];
647
648 return ret;
649}
650#undef NUM_MODES
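
For reference, a sketch of what the translation loop produces for the
familiar mode 0644 (flag values taken from protocol.h in this patch):

#include <assert.h>

#define ORANGEFS_O_READ  (1 << 2)
#define ORANGEFS_G_READ  (1 << 5)
#define ORANGEFS_U_WRITE (1 << 7)
#define ORANGEFS_U_READ  (1 << 8)

int main(void)
{
	/* ORANGEFS_util_translate_mode(0644) walks the rw-r--r-- mode
	 * bits and sets the matching ORANGEFS flags:
	 */
	int expected = ORANGEFS_U_READ | ORANGEFS_U_WRITE |
		       ORANGEFS_G_READ | ORANGEFS_O_READ;

	assert(expected == 0x1a4);
	return 0;
}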
651
652/*
653 * After obtaining a string representation of the client's debug
654 * keywords and their associated masks, this function is called to build an
655 * array of these values.
656 */
657int orangefs_prepare_cdm_array(char *debug_array_string)
658{
659 int i;
660 int rc = -EINVAL;
661 char *cds_head = NULL;
662 char *cds_delimiter = NULL;
663 int keyword_len = 0;
664
665 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
666
667 /*
668 * figure out how many elements the cdm_array needs.
669 */
670 for (i = 0; i < strlen(debug_array_string); i++)
671 if (debug_array_string[i] == '\n')
672 cdm_element_count++;
673
674 if (!cdm_element_count) {
675 pr_info("No elements in client debug array string!\n");
676 goto out;
677 }
678
679 cdm_array =
680 kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
681 GFP_KERNEL);
682 if (!cdm_array) {
683 pr_info("malloc failed for cdm_array!\n");
684 rc = -ENOMEM;
685 goto out;
686 }
687
688 cds_head = debug_array_string;
689
690 for (i = 0; i < cdm_element_count; i++) {
691 cds_delimiter = strchr(cds_head, '\n');
692 *cds_delimiter = '\0';
693
694 keyword_len = strcspn(cds_head, " ");
695
696 cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
697 if (!cdm_array[i].keyword) {
698 rc = -ENOMEM;
699 goto out;
700 }
701
702 sscanf(cds_head,
703 "%s %llx %llx",
704 cdm_array[i].keyword,
705 (unsigned long long *)&(cdm_array[i].mask1),
706 (unsigned long long *)&(cdm_array[i].mask2));
707
708 if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
709 client_verbose_index = i;
710
711 if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
712 client_all_index = i;
713
714 cds_head = cds_delimiter + 1;
715 }
716
717 rc = cdm_element_count;
718
719 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
720
721out:
722
723 return rc;
724
725}
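
The layout orangefs_prepare_cdm_array() expects, inferred from the parsing
above; the keywords match ORANGEFS_VERBOSE/ORANGEFS_ALL from protocol.h, but
the mask values shown are placeholders:

/*
 * Each element is "keyword mask1 mask2\n"; the "verbose" and "all"
 * entries get their indices remembered for the amalgam checks.
 */
static const char example_debug_array_string[] =
	"none 0 0\n"
	"verbose 1fffffff ffffffff\n"	/* placeholder masks */
	"all ffffffff ffffffff\n";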
726
727/*
728 * /sys/kernel/debug/orangefs/debug-help can be catted to
729 * see all the available kernel and client debug keywords.
730 *
731 * When the kernel boots, we have no idea what keywords the
732 * client supports, nor their associated masks.
733 *
734 * We pass through this function once at boot and stamp a
735 * boilerplate "we don't know" message for the client in the
736 * debug-help file. We pass through here again when the client
737 * starts and then we can fill out the debug-help file fully.
738 *
739 * The client might be restarted any number of times between
740 * reboots; we only build the debug-help file the first time.
741 */
742int orangefs_prepare_debugfs_help_string(int at_boot)
743{
744 int rc = -EINVAL;
745 int i;
746 int byte_count = 0;
747 char *client_title = "Client Debug Keywords:\n";
748 char *kernel_title = "Kernel Debug Keywords:\n";
749
750 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
751
752 if (at_boot) {
753 byte_count += strlen(HELP_STRING_UNINITIALIZED);
754 client_title = HELP_STRING_UNINITIALIZED;
755 } else {
756 /*
757 * fill the client keyword/mask array and remember
758 * how many elements there were.
759 */
760 cdm_element_count =
761 orangefs_prepare_cdm_array(client_debug_array_string);
762 if (cdm_element_count <= 0)
763 goto out;
764
765 /* Count the bytes destined for debug_help_string. */
766 byte_count += strlen(client_title);
767
768 for (i = 0; i < cdm_element_count; i++) {
769 byte_count += strlen(cdm_array[i].keyword) + 2;
770 if (byte_count >= DEBUG_HELP_STRING_SIZE) {
771 pr_info("%s: overflow 1!\n", __func__);
772 goto out;
773 }
774 }
775
776 gossip_debug(GOSSIP_UTILS_DEBUG,
777 "%s: cdm_element_count:%d:\n",
778 __func__,
779 cdm_element_count);
780 }
781
782 byte_count += strlen(kernel_title);
783 for (i = 0; i < num_kmod_keyword_mask_map; i++) {
784 byte_count +=
785 strlen(s_kmod_keyword_mask_map[i].keyword) + 2;
786 if (byte_count >= DEBUG_HELP_STRING_SIZE) {
787 pr_info("%s: overflow 2!\n", __func__);
788 goto out;
789 }
790 }
791
792 /* build debug_help_string. */
793 debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
794 if (!debug_help_string) {
795 rc = -ENOMEM;
796 goto out;
797 }
798
799 strcat(debug_help_string, client_title);
800
801 if (!at_boot) {
802 for (i = 0; i < cdm_element_count; i++) {
803 strcat(debug_help_string, "\t");
804 strcat(debug_help_string, cdm_array[i].keyword);
805 strcat(debug_help_string, "\n");
806 }
807 }
808
809 strcat(debug_help_string, "\n");
810 strcat(debug_help_string, kernel_title);
811
812 for (i = 0; i < num_kmod_keyword_mask_map; i++) {
813 strcat(debug_help_string, "\t");
814 strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
815 strcat(debug_help_string, "\n");
816 }
817
818 rc = 0;
819
820out:
821
822 return rc;
823
824}
825
826/*
827 * kernel = type 0
828 * client = type 1
829 */
830void debug_mask_to_string(void *mask, int type)
831{
832 int i;
833 int len = 0;
834 char *debug_string;
835 int element_count = 0;
836
837 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
838
839 if (type) {
840 debug_string = client_debug_string;
841 element_count = cdm_element_count;
842 } else {
843 debug_string = kernel_debug_string;
844 element_count = num_kmod_keyword_mask_map;
845 }
846
847 memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
848
849 /*
850 * Some keywords, like "all" or "verbose", are amalgams of
851 * numerous other keywords. Make a special check for those
852 * before grinding through the whole mask only to discover
853 * at the end that an amalgam matched.
854 */
855 if (check_amalgam_keyword(mask, type))
856 goto out;
857
858 /* Build the debug string. */
859 for (i = 0; i < element_count; i++)
860 if (type)
861 do_c_string(mask, i);
862 else
863 do_k_string(mask, i);
864
865 len = strlen(debug_string);
866
867 if ((len) && (type))
868 client_debug_string[len - 1] = '\0';
869 else if (len)
870 kernel_debug_string[len - 1] = '\0';
871 else if (type)
872 strcpy(client_debug_string, "none");
873 else
874 strcpy(kernel_debug_string, "none");
875
876out:
877 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
878
879 return;
880
881}
882
883void do_k_string(void *k_mask, int index)
884{
885 __u64 *mask = (__u64 *) k_mask;
886
887 if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
888 goto out;
889
890 if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
891 if ((strlen(kernel_debug_string) +
892 strlen(s_kmod_keyword_mask_map[index].keyword))
893 < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
894 strcat(kernel_debug_string,
895 s_kmod_keyword_mask_map[index].keyword);
896 strcat(kernel_debug_string, ",");
897 } else {
898 gossip_err("%s: overflow!\n", __func__);
899 strcpy(kernel_debug_string, ORANGEFS_ALL);
900 goto out;
901 }
902 }
903
904out:
905
906 return;
907}
908
909void do_c_string(void *c_mask, int index)
910{
911 struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
912
913 if (keyword_is_amalgam(cdm_array[index].keyword))
914 goto out;
915
916 if ((mask->mask1 & cdm_array[index].mask1) ||
917 (mask->mask2 & cdm_array[index].mask2)) {
918 if ((strlen(client_debug_string) +
919 strlen(cdm_array[index].keyword) + 1)
920 < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
921 strcat(client_debug_string,
922 cdm_array[index].keyword);
923 strcat(client_debug_string, ",");
924 } else {
925 gossip_err("%s: overflow!\n", __func__);
926 strcpy(client_debug_string, ORANGEFS_ALL);
927 goto out;
928 }
929 }
930out:
931 return;
932}
933
934int keyword_is_amalgam(char *keyword)
935{
936 int rc = 0;
937
938 if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
939 rc = 1;
940
941 return rc;
942}
943
944/*
945 * kernel = type 0
946 * client = type 1
947 *
948 * return 1 if we found an amalgam.
949 */
950int check_amalgam_keyword(void *mask, int type)
951{
952 __u64 *k_mask;
953 struct client_debug_mask *c_mask;
954 int k_all_index = num_kmod_keyword_mask_map - 1;
955 int rc = 0;
956
957 if (type) {
958 c_mask = (struct client_debug_mask *) mask;
959
960 if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
961 (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
962 strcpy(client_debug_string, ORANGEFS_ALL);
963 rc = 1;
964 goto out;
965 }
966
967 if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
968 (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
969 strcpy(client_debug_string, ORANGEFS_VERBOSE);
970 rc = 1;
971 goto out;
972 }
973
974 } else {
975 k_mask = (__u64 *) mask;
976
977 if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
978 strcpy(kernel_debug_string, ORANGEFS_ALL);
979 rc = 1;
980 goto out;
981 }
982 }
983
984out:
985
986 return rc;
987}
988
989/*
990 * kernel = type 0
991 * client = type 1
992 */
993void debug_string_to_mask(char *debug_string, void *mask, int type)
994{
995 char *unchecked_keyword;
996 int i;
997 char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
998 char *original_pointer;
999 int element_count = 0;
1000 struct client_debug_mask *c_mask;
1001 __u64 *k_mask;
1002
1003 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
1004
1005 if (type) {
1006 c_mask = (struct client_debug_mask *)mask;
1007 element_count = cdm_element_count;
1008 } else {
1009 k_mask = (__u64 *)mask;
1010 *k_mask = 0;
1011 element_count = num_kmod_keyword_mask_map;
1012 }
1013
1014 original_pointer = strsep_fodder;
1015 while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
1016 if (strlen(unchecked_keyword)) {
1017 for (i = 0; i < element_count; i++)
1018 if (type)
1019 do_c_mask(i,
1020 unchecked_keyword,
1021 &c_mask);
1022 else
1023 do_k_mask(i,
1024 unchecked_keyword,
1025 &k_mask);
1026 }
1027
1028 kfree(original_pointer);
1029}
1030
1031void do_c_mask(int i,
1032 char *unchecked_keyword,
1033 struct client_debug_mask **sane_mask)
1034{
1035
1036 if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
1037 (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
1038 (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
1039 }
1040}
1041
1042void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
1043{
1044
1045 if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
1046 **sane_mask = (**sane_mask) |
1047 s_kmod_keyword_mask_map[i].mask_val;
1048}
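
A usage sketch for the string-to-mask direction (the keywords shown are
hypothetical; real ones come from s_kmod_keyword_mask_map):

static void example_set_kernel_mask(void)
{
	__u64 mask = 0;

	/* type 0 == kernel; ORs in the mask_val of each listed keyword */
	debug_string_to_mask("super,inode", &mask, 0);
	gossip_debug(GOSSIP_UTILS_DEBUG, "mask now %llx\n", llu(mask));
}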
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
new file mode 100644
index 000000000000..45ce4ff4cbc7
--- /dev/null
+++ b/fs/orangefs/protocol.h
@@ -0,0 +1,452 @@
1#include <linux/types.h>
2#include <linux/spinlock_types.h>
3#include <linux/slab.h>
4#include <linux/ioctl.h>
5
6extern struct client_debug_mask *cdm_array;
7extern char *debug_help_string;
8extern int help_string_initialized;
9extern struct dentry *debug_dir;
10extern struct dentry *help_file_dentry;
11extern struct dentry *client_debug_dentry;
12extern const struct file_operations debug_help_fops;
13extern int client_all_index;
14extern int client_verbose_index;
15extern int cdm_element_count;
16#define DEBUG_HELP_STRING_SIZE 4096
17#define HELP_STRING_UNINITIALIZED \
18 "Client Debug Keywords are unknown until the first time\n" \
19 "the client is started after boot.\n"
20#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
21#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
22#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
23#define ORANGEFS_VERBOSE "verbose"
24#define ORANGEFS_ALL "all"
25
26/* pvfs2-config.h ***********************************************************/
27#define ORANGEFS_VERSION_MAJOR 2
28#define ORANGEFS_VERSION_MINOR 9
29#define ORANGEFS_VERSION_SUB 0
30
31/* khandle stuff ***********************************************************/
32
33/*
34 * The 2.9 core will put 64 bit handles in here like this:
35 * 1234 0000 0000 5678
36 * The 3.0 and beyond cores will put 128 bit handles in here like this:
37 * 1234 5678 90AB CDEF
38 * The kernel module will always use the first four bytes and
39 * the last four bytes as an inum.
40 */
41struct orangefs_khandle {
42 unsigned char u[16];
43} __aligned(8);
44
45/*
46 * kernel version of an object ref.
47 */
48struct orangefs_object_kref {
49 struct orangefs_khandle khandle;
50 __s32 fs_id;
51 __s32 __pad1;
52};
53
54/*
55 * compare 2 khandles assumes little endian thus from large address to
56 * small address
57 */
58static inline int ORANGEFS_khandle_cmp(const struct orangefs_khandle *kh1,
59 const struct orangefs_khandle *kh2)
60{
61 int i;
62
63 for (i = 15; i >= 0; i--) {
64 if (kh1->u[i] > kh2->u[i])
65 return 1;
66 if (kh1->u[i] < kh2->u[i])
67 return -1;
68 }
69
70 return 0;
71}
72
73static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh,
74 void *p, int size)
75{
76
77 memset(p, 0, size);
78 memcpy(p, kh->u, 16);
79
80}
81
82static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh,
83 void *p, int size)
84{
85 memset(kh, 0, 16);
86 memcpy(kh->u, p, 16);
87
88}
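
A hypothetical sketch of the inum convention described in the khandle comment
above (first four bytes and last four bytes); the helper name and byte-order
choices here are illustrative, not the module's actual implementation:

static inline __u64 khandle_to_inum_sketch(const struct orangefs_khandle *kh)
{
	__u32 hi, lo;

	memcpy(&hi, &kh->u[0], 4);	/* first four bytes */
	memcpy(&lo, &kh->u[12], 4);	/* last four bytes */
	return ((__u64)hi << 32) | lo;
}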
89
90/* pvfs2-types.h ************************************************************/
91typedef __u32 ORANGEFS_uid;
92typedef __u32 ORANGEFS_gid;
93typedef __s32 ORANGEFS_fs_id;
94typedef __u32 ORANGEFS_permissions;
95typedef __u64 ORANGEFS_time;
96typedef __s64 ORANGEFS_size;
97typedef __u64 ORANGEFS_flags;
98typedef __u64 ORANGEFS_ds_position;
99typedef __s32 ORANGEFS_error;
100typedef __s64 ORANGEFS_offset;
101
102#define ORANGEFS_SUPER_MAGIC 0x20030528
103
104/*
105 * ORANGEFS error codes are signed 32-bit integers. Error codes are negative, but
106 * the sign is stripped before decoding.
107 */
108
109/* Bit 31 is not used since it is the sign. */
110
111/*
112 * Bit 30 specifies that this is an ORANGEFS error. An ORANGEFS error is either
113 * an encoded errno value or an ORANGEFS protocol error.
114 */
115#define ORANGEFS_ERROR_BIT (1 << 30)
116
117/*
118 * Bit 29 specifies that this is an ORANGEFS protocol error and not an encoded
119 * errno value.
120 */
121#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29)
122
123/*
124 * Bits 9, 8, and 7 specify the error class, which encodes the section of
125 * server code the error originated in for logging purposes. It is not used
126 * in the kernel except to be masked out.
127 */
128#define ORANGEFS_ERROR_CLASS_BITS 0x380
129
130/* Bits 6 - 0 are reserved for the actual error code. */
131#define ORANGEFS_ERROR_NUMBER_BITS 0x7f
132
133/* Encoded errno values decoded by PINT_errno_mapping in orangefs-utils.c. */
134
135/* Our own ORANGEFS protocol error codes. */
136#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
137#define ORANGEFS_EDEVINIT (2|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
138#define ORANGEFS_EDETAIL (3|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
139#define ORANGEFS_EHOSTNTFD (4|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
140#define ORANGEFS_EADDRNTFD (5|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
141#define ORANGEFS_ENORECVR (6|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
142#define ORANGEFS_ETRYAGAIN (7|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
143#define ORANGEFS_ENOTPVFS (8|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
144#define ORANGEFS_ESECURITY (9|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
145
146/* permission bits */
147#define ORANGEFS_O_EXECUTE (1 << 0)
148#define ORANGEFS_O_WRITE (1 << 1)
149#define ORANGEFS_O_READ (1 << 2)
150#define ORANGEFS_G_EXECUTE (1 << 3)
151#define ORANGEFS_G_WRITE (1 << 4)
152#define ORANGEFS_G_READ (1 << 5)
153#define ORANGEFS_U_EXECUTE (1 << 6)
154#define ORANGEFS_U_WRITE (1 << 7)
155#define ORANGEFS_U_READ (1 << 8)
156/* no ORANGEFS_U_VTX (sticky bit) */
157#define ORANGEFS_G_SGID (1 << 10)
158#define ORANGEFS_U_SUID (1 << 11)
159
160/* definition taken from stdint.h */
161#define INT32_MAX (2147483647)
162#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
163#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
164#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
165#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
166#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
167#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
168#define ORANGEFS_APPEND_FL FS_APPEND_FL
169#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
170#define ORANGEFS_MIRROR_FL 0x01000000ULL
171#define ORANGEFS_O_EXECUTE (1 << 0)
172#define ORANGEFS_FS_ID_NULL ((__s32)0)
173#define ORANGEFS_ATTR_SYS_UID (1 << 0)
174#define ORANGEFS_ATTR_SYS_GID (1 << 1)
175#define ORANGEFS_ATTR_SYS_PERM (1 << 2)
176#define ORANGEFS_ATTR_SYS_ATIME (1 << 3)
177#define ORANGEFS_ATTR_SYS_CTIME (1 << 4)
178#define ORANGEFS_ATTR_SYS_MTIME (1 << 5)
179#define ORANGEFS_ATTR_SYS_TYPE (1 << 6)
180#define ORANGEFS_ATTR_SYS_ATIME_SET (1 << 7)
181#define ORANGEFS_ATTR_SYS_MTIME_SET (1 << 8)
182#define ORANGEFS_ATTR_SYS_SIZE (1 << 20)
183#define ORANGEFS_ATTR_SYS_LNK_TARGET (1 << 24)
184#define ORANGEFS_ATTR_SYS_DFILE_COUNT (1 << 25)
185#define ORANGEFS_ATTR_SYS_DIRENT_COUNT (1 << 26)
186#define ORANGEFS_ATTR_SYS_BLKSIZE (1 << 28)
187#define ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT (1 << 29)
188#define ORANGEFS_ATTR_SYS_COMMON_ALL \
189 (ORANGEFS_ATTR_SYS_UID | \
190 ORANGEFS_ATTR_SYS_GID | \
191 ORANGEFS_ATTR_SYS_PERM | \
192 ORANGEFS_ATTR_SYS_ATIME | \
193 ORANGEFS_ATTR_SYS_CTIME | \
194 ORANGEFS_ATTR_SYS_MTIME | \
195 ORANGEFS_ATTR_SYS_TYPE)
196
197#define ORANGEFS_ATTR_SYS_ALL_SETABLE \
198(ORANGEFS_ATTR_SYS_COMMON_ALL-ORANGEFS_ATTR_SYS_TYPE)
199
200#define ORANGEFS_ATTR_SYS_ALL_NOHINT \
201 (ORANGEFS_ATTR_SYS_COMMON_ALL | \
202 ORANGEFS_ATTR_SYS_SIZE | \
203 ORANGEFS_ATTR_SYS_LNK_TARGET | \
204 ORANGEFS_ATTR_SYS_DFILE_COUNT | \
205 ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
206 ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
207 ORANGEFS_ATTR_SYS_BLKSIZE)
208
209#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
210 (ORANGEFS_ATTR_SYS_COMMON_ALL | \
211 ORANGEFS_ATTR_SYS_LNK_TARGET | \
212 ORANGEFS_ATTR_SYS_DFILE_COUNT | \
213 ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
214 ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
215 ORANGEFS_ATTR_SYS_BLKSIZE)
216
217#define ORANGEFS_XATTR_REPLACE 0x2
218#define ORANGEFS_XATTR_CREATE 0x1
219#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
220#define ORANGEFS_NAME_MAX 256
221/*
222 * max extended attribute name len as imposed by the VFS and exploited for the
223 * upcall request types.
224 * NOTE: Please retain these as multiples of 8 even if you wish to change them.
225 * This is *NECESSARY* for supporting 32 bit user-space binaries on a 64-bit
226 * kernel. Due to implementation within DBPF, this really needs to be
227 * ORANGEFS_NAME_MAX, which it happened to equal, but there is no reason
228 * to let it break if that changes in the future.
229 */
230#define ORANGEFS_MAX_XATTR_NAMELEN ORANGEFS_NAME_MAX /* Not the same as
231 * XATTR_NAME_MAX defined
232 * by <linux/xattr.h>
233 */
234#define ORANGEFS_MAX_XATTR_VALUELEN 8192 /* Not the same as XATTR_SIZE_MAX
235 * defined by <linux/xattr.h>
236 */
237#define ORANGEFS_MAX_XATTR_LISTLEN 16 /* Not the same as XATTR_LIST_MAX
238 * defined by <linux/xattr.h>
239 */
240/*
241 * ORANGEFS I/O operation types, used in both system and server interfaces.
242 */
243enum ORANGEFS_io_type {
244 ORANGEFS_IO_READ = 1,
245 ORANGEFS_IO_WRITE = 2
246};
247
248/*
249 * If this enum is modified the server parameters related to the precreate pool
250 * batch and low threshold sizes may need to be modified to reflect this
251 * change.
252 */
253enum orangefs_ds_type {
254 ORANGEFS_TYPE_NONE = 0,
255 ORANGEFS_TYPE_METAFILE = (1 << 0),
256 ORANGEFS_TYPE_DATAFILE = (1 << 1),
257 ORANGEFS_TYPE_DIRECTORY = (1 << 2),
258 ORANGEFS_TYPE_SYMLINK = (1 << 3),
259 ORANGEFS_TYPE_DIRDATA = (1 << 4),
260 ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */
261};
262
263/*
264 * ORANGEFS_certificate simply stores a buffer with the buffer size.
265 * The buffer can be converted to an OpenSSL X509 struct for use.
266 */
267struct ORANGEFS_certificate {
268 __u32 buf_size;
269 unsigned char *buf;
270};
271
272/*
273 * A credential identifies a user and is signed by the client/user
274 * private key.
275 */
276struct ORANGEFS_credential {
277 __u32 userid; /* user id */
278 __u32 num_groups; /* length of group_array */
279 __u32 *group_array; /* groups for which the user is a member */
280 char *issuer; /* alias of the issuing server */
281 __u64 timeout; /* seconds after epoch to time out */
282 __u32 sig_size; /* length of the signature in bytes */
283 unsigned char *signature; /* digital signature */
284 struct ORANGEFS_certificate certificate; /* user certificate buffer */
285};
286#define extra_size_ORANGEFS_credential (ORANGEFS_REQ_LIMIT_GROUPS * \
287 sizeof(__u32) + \
288 ORANGEFS_REQ_LIMIT_ISSUER + \
289 ORANGEFS_REQ_LIMIT_SIGNATURE + \
290 extra_size_ORANGEFS_certificate)
291
292/* This structure is used by the VFS-client interaction alone */
293struct ORANGEFS_keyval_pair {
294 char key[ORANGEFS_MAX_XATTR_NAMELEN];
295 __s32 key_sz; /* __s32 for portable, fixed-size structures */
296 __s32 val_sz;
297 char val[ORANGEFS_MAX_XATTR_VALUELEN];
298};
299
300/* pvfs2-sysint.h ***********************************************************/
301/* Describes attributes for a file, directory, or symlink. */
302struct ORANGEFS_sys_attr_s {
303 __u32 owner;
304 __u32 group;
305 __u32 perms;
306 __u64 atime;
307 __u64 mtime;
308 __u64 ctime;
309 __s64 size;
310
311 /* NOTE: caller must free if valid */
312 char *link_target;
313
314 /* Changed to __s32 so that size of structure does not change */
315 __s32 dfile_count;
316
317 /* Changed to __s32 so that size of structure does not change */
318 __s32 distr_dir_servers_initial;
319
320 /* Changed to __s32 so that size of structure does not change */
321 __s32 distr_dir_servers_max;
322
323 /* Changed to __s32 so that size of structure does not change */
324 __s32 distr_dir_split_size;
325
326 __u32 mirror_copies_count;
327
328 /* NOTE: caller must free if valid */
329 char *dist_name;
330
331 /* NOTE: caller must free if valid */
332 char *dist_params;
333
334 __s64 dirent_count;
335 enum orangefs_ds_type objtype;
336 __u64 flags;
337 __u32 mask;
338 __s64 blksize;
339};
340
341#define ORANGEFS_LOOKUP_LINK_NO_FOLLOW 0
342
343/* pint-dev.h ***************************************************************/
344
345/* parameter structure used in ORANGEFS_DEV_DEBUG ioctl command */
346struct dev_mask_info_s {
347 enum {
348 KERNEL_MASK,
349 CLIENT_MASK,
350 } mask_type;
351 __u64 mask_value;
352};
353
354struct dev_mask2_info_s {
355 __u64 mask1_value;
356 __u64 mask2_value;
357};
358
359/* pvfs2-util.h *************************************************************/
360__s32 ORANGEFS_util_translate_mode(int mode);
361
362/* pvfs2-debug.h ************************************************************/
363#include "orangefs-debug.h"
364
365/* pvfs2-internal.h *********************************************************/
366#define llu(x) (unsigned long long)(x)
367#define lld(x) (long long)(x)
368
369/* pint-dev-shared.h ********************************************************/
370#define ORANGEFS_DEV_MAGIC 'k'
371
372#define ORANGEFS_READDIR_DEFAULT_DESC_COUNT 5
373
374#define DEV_GET_MAGIC 0x1
375#define DEV_GET_MAX_UPSIZE 0x2
376#define DEV_GET_MAX_DOWNSIZE 0x3
377#define DEV_MAP 0x4
378#define DEV_REMOUNT_ALL 0x5
379#define DEV_DEBUG 0x6
380#define DEV_UPSTREAM 0x7
381#define DEV_CLIENT_MASK 0x8
382#define DEV_CLIENT_STRING 0x9
383#define DEV_MAX_NR 0xa
384
385/* supported ioctls, codes are with respect to user-space */
386enum {
387 ORANGEFS_DEV_GET_MAGIC = _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAGIC, __s32),
388 ORANGEFS_DEV_GET_MAX_UPSIZE =
389 _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_UPSIZE, __s32),
390 ORANGEFS_DEV_GET_MAX_DOWNSIZE =
391 _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_DOWNSIZE, __s32),
392 ORANGEFS_DEV_MAP = _IO(ORANGEFS_DEV_MAGIC, DEV_MAP),
393 ORANGEFS_DEV_REMOUNT_ALL = _IO(ORANGEFS_DEV_MAGIC, DEV_REMOUNT_ALL),
394 ORANGEFS_DEV_DEBUG = _IOR(ORANGEFS_DEV_MAGIC, DEV_DEBUG, __s32),
395 ORANGEFS_DEV_UPSTREAM = _IOW(ORANGEFS_DEV_MAGIC, DEV_UPSTREAM, int),
396 ORANGEFS_DEV_CLIENT_MASK = _IOW(ORANGEFS_DEV_MAGIC,
397 DEV_CLIENT_MASK,
398 struct dev_mask2_info_s),
399 ORANGEFS_DEV_CLIENT_STRING = _IOW(ORANGEFS_DEV_MAGIC,
400 DEV_CLIENT_STRING,
401 char *),
402 ORANGEFS_DEV_MAXNR = DEV_MAX_NR,
403};
404
405/*
406 * version number for use in communicating between kernel space and user
407 * space. Zero signifies the upstream version of the kernel module.
408 */
409#define ORANGEFS_KERNEL_PROTO_VERSION 0
410#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20904
411
412/*
413 * describes memory regions to map in the ORANGEFS_DEV_MAP ioctl.
414 * NOTE: See devorangefs-req.c for 32 bit compat structure.
415 * Since this structure has a variable-sized layout that is different
416 * on 32 and 64 bit platforms, we need to normalize to a 64 bit layout
417 * on such systems before servicing ioctl calls from user-space binaries
418 * that may be 32 bit!
419 */
420struct ORANGEFS_dev_map_desc {
421 void *ptr;
422 __s32 total_size;
423 __s32 size;
424 __s32 count;
425};
426
427/* gossip.h *****************************************************************/
428
429#ifdef GOSSIP_DISABLE_DEBUG
430#define gossip_debug(mask, format, f...) do {} while (0)
431#else
432extern __u64 gossip_debug_mask;
433extern struct client_debug_mask client_debug_mask;
434
435/* try to avoid function call overhead by checking masks in macro */
436#define gossip_debug(mask, format, f...) \
437do { \
438 if (gossip_debug_mask & mask) \
439 printk(format, ##f); \
440} while (0)
441#endif /* GOSSIP_DISABLE_DEBUG */
442
443/* do file and line number printouts w/ the GNU preprocessor */
444#define gossip_ldebug(mask, format, f...) \
445 gossip_debug(mask, "%s: " format, __func__, ##f)
446
447#define gossip_err printk
448#define gossip_lerr(format, f...) \
449 gossip_err("%s line %d: " format, \
450 __FILE__, \
451 __LINE__, \
452 ##f)
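
Typical use of the gossip macros above (a sketch; the message text is
illustrative, and GOSSIP_SUPER_DEBUG is one of the kernel mask bits used
elsewhere in this patch):

static void gossip_example(void)
{
	/* printed only when GOSSIP_SUPER_DEBUG is set in gossip_debug_mask */
	gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs: mounting fs_id %d\n", 3);

	/* same, prefixed with the calling function's name */
	gossip_ldebug(GOSSIP_SUPER_DEBUG, "got here\n");

	/* unconditional, with file and line number */
	gossip_lerr("unexpected state\n");
}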
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
new file mode 100644
index 000000000000..b9da9a0281c9
--- /dev/null
+++ b/fs/orangefs/super.c
@@ -0,0 +1,559 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10
11#include <linux/parser.h>
12
13/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
14static struct kmem_cache *orangefs_inode_cache;
15
16/* list for storing orangefs specific superblocks in use */
17LIST_HEAD(orangefs_superblocks);
18
19DEFINE_SPINLOCK(orangefs_superblocks_lock);
20
21enum {
22 Opt_intr,
23 Opt_acl,
24 Opt_local_lock,
25
26 Opt_err
27};
28
29static const match_table_t tokens = {
30 { Opt_acl, "acl" },
31 { Opt_intr, "intr" },
32 { Opt_local_lock, "local_lock" },
33 { Opt_err, NULL }
34};
35
36
37static int parse_mount_options(struct super_block *sb, char *options,
38 int silent)
39{
40 struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
41 substring_t args[MAX_OPT_ARGS];
42 char *p;
43
44 /*
45 * Force any potential flags that might be set from the mount
46 * to zero, ie, initialize to unset.
47 */
48 sb->s_flags &= ~MS_POSIXACL;
49 orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
50 orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
51
52 while ((p = strsep(&options, ",")) != NULL) {
53 int token;
54
55 if (!*p)
56 continue;
57
58 token = match_token(p, tokens, args);
59 switch (token) {
60 case Opt_acl:
61 sb->s_flags |= MS_POSIXACL;
62 break;
63 case Opt_intr:
64 orangefs_sb->flags |= ORANGEFS_OPT_INTR;
65 break;
66 case Opt_local_lock:
67 orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
68 break;
69 default:
70 goto fail;
71 }
72 }
73
74 return 0;
75fail:
76 if (!silent)
77 gossip_err("Error: mount option [%s] is not supported.\n", p);
78 return -EINVAL;
79}
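
For illustration, the full option set this parser accepts (the filesystem
type and device name below are assumptions, not taken from this patch):

/*
 * e.g. from userspace:
 *
 *   mount -t pvfs2 tcp://server:3334/orangefs /mnt -o acl,intr,local_lock
 *
 * "acl" sets MS_POSIXACL on the superblock; "intr" and "local_lock"
 * set ORANGEFS_OPT_* flags in the orangefs-private sb info. Any other
 * option fails the mount with -EINVAL unless silent.
 */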
80
81static void orangefs_inode_cache_ctor(void *req)
82{
83 struct orangefs_inode_s *orangefs_inode = req;
84
85 inode_init_once(&orangefs_inode->vfs_inode);
86 init_rwsem(&orangefs_inode->xattr_sem);
87
88 orangefs_inode->vfs_inode.i_version = 1;
89}
90
91static struct inode *orangefs_alloc_inode(struct super_block *sb)
92{
93 struct orangefs_inode_s *orangefs_inode;
94
95 orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
96 if (orangefs_inode == NULL) {
97 gossip_err("Failed to allocate orangefs_inode\n");
98 return NULL;
99 }
100
101 /*
102 * We want to clear everything except for rw_semaphore and the
103 * vfs_inode.
104 */
105 memset(&orangefs_inode->refn.khandle, 0, 16);
106 orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL;
107 orangefs_inode->last_failed_block_index_read = 0;
108 memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target));
109 orangefs_inode->pinode_flags = 0;
110
111 gossip_debug(GOSSIP_SUPER_DEBUG,
112 "orangefs_alloc_inode: allocated %p\n",
113 &orangefs_inode->vfs_inode);
114 return &orangefs_inode->vfs_inode;
115}
116
117static void orangefs_destroy_inode(struct inode *inode)
118{
119 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
120
121 gossip_debug(GOSSIP_SUPER_DEBUG,
122 "%s: deallocated %p destroying inode %pU\n",
123 __func__, orangefs_inode, get_khandle_from_ino(inode));
124
125 kmem_cache_free(orangefs_inode_cache, orangefs_inode);
126}
127
128/*
129 * NOTE: information filled in here is typically reflected in the
130 * output of the system command 'df'
131 */
132static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
133{
134 int ret = -ENOMEM;
135 struct orangefs_kernel_op_s *new_op = NULL;
136 int flags = 0;
137 struct super_block *sb = NULL;
138
139 sb = dentry->d_sb;
140
141 gossip_debug(GOSSIP_SUPER_DEBUG,
142 "orangefs_statfs: called on sb %p (fs_id is %d)\n",
143 sb,
144 (int)(ORANGEFS_SB(sb)->fs_id));
145
146 new_op = op_alloc(ORANGEFS_VFS_OP_STATFS);
147 if (!new_op)
148 return ret;
149 new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id;
150
151 if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR)
152 flags = ORANGEFS_OP_INTERRUPTIBLE;
153
154 ret = service_operation(new_op, "orangefs_statfs", flags);
155
156 if (new_op->downcall.status < 0)
157 goto out_op_release;
158
159 gossip_debug(GOSSIP_SUPER_DEBUG,
160 "%s: got %ld blocks available | "
161 "%ld blocks total | %ld block size | "
162 "%ld files total | %ld files avail\n",
163 __func__,
164 (long)new_op->downcall.resp.statfs.blocks_avail,
165 (long)new_op->downcall.resp.statfs.blocks_total,
166 (long)new_op->downcall.resp.statfs.block_size,
167 (long)new_op->downcall.resp.statfs.files_total,
168 (long)new_op->downcall.resp.statfs.files_avail);
169
170 buf->f_type = sb->s_magic;
171 memcpy(&buf->f_fsid, &ORANGEFS_SB(sb)->fs_id, sizeof(buf->f_fsid));
172 buf->f_bsize = new_op->downcall.resp.statfs.block_size;
173 buf->f_namelen = ORANGEFS_NAME_MAX;
174
175 buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total;
176 buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
177 buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
178 buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total;
179 buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail;
180 buf->f_frsize = sb->s_blocksize;
181
182out_op_release:
183 op_release(new_op);
184 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret);
185 return ret;
186}
187
188/*
189 * Remount as initiated by VFS layer. We just need to reparse the mount
190 * options; there is no need to signal pvfs2-client-core about it.
191 */
192static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data)
193{
194 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n");
195 return parse_mount_options(sb, data, 1);
196}
197
198/*
199 * Remount as initiated by pvfs2-client-core on restart. This is used to
200 * repopulate mount information left from previous pvfs2-client-core.
201 *
202 * The idea here is that given a valid superblock, we're
203 * re-initializing the user space client with the initial mount
204 * information specified when the super block was first initialized.
205 * This is very different from the first initialization/creation of a
206 * superblock. We use the special service_priority_operation to make
207 * sure that the mount gets ahead of any other pending operation that
208 * is waiting for servicing. This means that the pvfs2-client won't
209 * fail to start several times, once for each pending operation, before
210 * the client regains all of the mount information from us.
211 * NOTE: this function assumes that the request_mutex is already acquired!
212 */
213int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
214{
215 struct orangefs_kernel_op_s *new_op;
216 int ret = -EINVAL;
217
218 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n");
219
220 new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
221 if (!new_op)
222 return -ENOMEM;
223 strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
224 orangefs_sb->devname,
225 ORANGEFS_MAX_SERVER_ADDR_LEN);
226
227 gossip_debug(GOSSIP_SUPER_DEBUG,
228 "Attempting ORANGEFS Remount via host %s\n",
229 new_op->upcall.req.fs_mount.orangefs_config_server);
230
231 /*
232 * we assume that the calling function has already acquired the
233 * request_mutex to prevent other operations from bypassing
234 * this one
235 */
236 ret = service_operation(new_op, "orangefs_remount",
237 ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX);
238 gossip_debug(GOSSIP_SUPER_DEBUG,
239 "orangefs_remount: mount got return value of %d\n",
240 ret);
241 if (ret == 0) {
242 /*
243 * store the id assigned to this sb -- it's just a
244 * short-lived mapping that the system interface uses
245 * to map this superblock to a particular mount entry
246 */
247 orangefs_sb->id = new_op->downcall.resp.fs_mount.id;
248 orangefs_sb->mount_pending = 0;
249 }
250
251 op_release(new_op);
252 return ret;
253}
254
255int fsid_key_table_initialize(void)
256{
257 return 0;
258}
259
260void fsid_key_table_finalize(void)
261{
262}
263
264/* Called whenever the VFS dirties the inode in response to atime updates */
265static void orangefs_dirty_inode(struct inode *inode, int flags)
266{
267 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
268
269 gossip_debug(GOSSIP_SUPER_DEBUG,
270 "orangefs_dirty_inode: %pU\n",
271 get_khandle_from_ino(inode));
272 SetAtimeFlag(orangefs_inode);
273}
274
275static const struct super_operations orangefs_s_ops = {
276 .alloc_inode = orangefs_alloc_inode,
277 .destroy_inode = orangefs_destroy_inode,
278 .dirty_inode = orangefs_dirty_inode,
279 .drop_inode = generic_delete_inode,
280 .statfs = orangefs_statfs,
281 .remount_fs = orangefs_remount_fs,
282 .show_options = generic_show_options,
283};
284
285static struct dentry *orangefs_fh_to_dentry(struct super_block *sb,
286 struct fid *fid,
287 int fh_len,
288 int fh_type)
289{
290 struct orangefs_object_kref refn;
291
292 if (fh_len < 5 || fh_type > 2)
293 return NULL;
294
295 ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16);
296 refn.fs_id = (u32) fid->raw[4];
297 gossip_debug(GOSSIP_SUPER_DEBUG,
298 "fh_to_dentry: handle %pU, fs_id %d\n",
299 &refn.khandle,
300 refn.fs_id);
301
302 return d_obtain_alias(orangefs_iget(sb, &refn));
303}
304
305static int orangefs_encode_fh(struct inode *inode,
306 __u32 *fh,
307 int *max_len,
308 struct inode *parent)
309{
310 int len = parent ? 10 : 5;
311 int type = 1;
312 struct orangefs_object_kref refn;
313
314 if (*max_len < len) {
315 gossip_lerr("fh buffer is too small for encoding\n");
316 *max_len = len;
317 type = 255;
318 goto out;
319 }
320
321 refn = ORANGEFS_I(inode)->refn;
322 ORANGEFS_khandle_to(&refn.khandle, fh, 16);
323 fh[4] = refn.fs_id;
324
325 gossip_debug(GOSSIP_SUPER_DEBUG,
326 "Encoding fh: handle %pU, fsid %u\n",
327 &refn.khandle,
328 refn.fs_id);
329
330
331 if (parent) {
332 refn = ORANGEFS_I(parent)->refn;
333 ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
334 fh[9] = refn.fs_id;
335
336 type = 2;
337 gossip_debug(GOSSIP_SUPER_DEBUG,
338 "Encoding parent: handle %pU, fsid %u\n",
339 &refn.khandle,
340 refn.fs_id);
341 }
342 *max_len = len;
343
344out:
345 return type;
346}
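
A hypothetical view of the encoded handle the function above produces (word
layout only; this struct is not used by the code):

struct orangefs_fh_sketch {
	__u32 khandle[4];		/* fh[0..3]: object khandle, 16 bytes */
	__u32 fs_id;			/* fh[4] */
	__u32 parent_khandle[4];	/* fh[5..8]: only when type == 2 */
	__u32 parent_fs_id;		/* fh[9]:    only when type == 2 */
};
/* len is 5 words for type 1 and 10 for type 2; type 255 signals that
 * the caller's buffer was too small (*max_len is set to what's needed).
 */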
347
348static const struct export_operations orangefs_export_ops = {
349 .encode_fh = orangefs_encode_fh,
350 .fh_to_dentry = orangefs_fh_to_dentry,
351};
352
353static int orangefs_fill_sb(struct super_block *sb,
354 struct orangefs_fs_mount_response *fs_mount,
355 void *data, int silent)
356{
357 int ret = -EINVAL;
358 struct inode *root = NULL;
359 struct dentry *root_dentry = NULL;
360 struct orangefs_object_kref root_object;
361
362 /* alloc and init our private orangefs sb info */
363 sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
364 if (!ORANGEFS_SB(sb))
365 return -ENOMEM;
366 ORANGEFS_SB(sb)->sb = sb;
367
368 ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle;
369 ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id;
370 ORANGEFS_SB(sb)->id = fs_mount->id;
371
372 if (data) {
373 ret = parse_mount_options(sb, data, silent);
374 if (ret)
375 return ret;
376 }
377
378 /* Hang the xattr handlers off the superblock */
379 sb->s_xattr = orangefs_xattr_handlers;
380 sb->s_magic = ORANGEFS_SUPER_MAGIC;
381 sb->s_op = &orangefs_s_ops;
382 sb->s_d_op = &orangefs_dentry_operations;
383
384 sb->s_blocksize = orangefs_bufmap_size_query();
385 sb->s_blocksize_bits = orangefs_bufmap_shift_query();
386 sb->s_maxbytes = MAX_LFS_FILESIZE;
387
388 root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
389 root_object.fs_id = ORANGEFS_SB(sb)->fs_id;
390 gossip_debug(GOSSIP_SUPER_DEBUG,
391 "get inode %pU, fsid %d\n",
392 &root_object.khandle,
393 root_object.fs_id);
394
395 root = orangefs_iget(sb, &root_object);
396 if (IS_ERR(root))
397 return PTR_ERR(root);
398
399 gossip_debug(GOSSIP_SUPER_DEBUG,
400 "Allocated root inode [%p] with mode %x\n",
401 root,
402 root->i_mode);
403
404 /* allocates and places root dentry in dcache */
405 root_dentry = d_make_root(root);
406 if (!root_dentry)
407 return -ENOMEM;
408
409 sb->s_export_op = &orangefs_export_ops;
410 sb->s_root = root_dentry;
411 return 0;
412}
413
414struct dentry *orangefs_mount(struct file_system_type *fst,
415 int flags,
416 const char *devname,
417 void *data)
418{
419 int ret = -EINVAL;
420 struct super_block *sb = ERR_PTR(-EINVAL);
421 struct orangefs_kernel_op_s *new_op;
422 struct dentry *d = ERR_PTR(-EINVAL);
423
424 gossip_debug(GOSSIP_SUPER_DEBUG,
425 "orangefs_mount: called with devname %s\n",
426 devname);
427
428 if (!devname) {
429 gossip_err("ERROR: device name not specified.\n");
430 return ERR_PTR(-EINVAL);
431 }
432
433 new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
434 if (!new_op)
435 return ERR_PTR(-ENOMEM);
436
437 strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
438 devname,
439 ORANGEFS_MAX_SERVER_ADDR_LEN);
440
441 gossip_debug(GOSSIP_SUPER_DEBUG,
442 "Attempting ORANGEFS Mount via host %s\n",
443 new_op->upcall.req.fs_mount.orangefs_config_server);
444
445 ret = service_operation(new_op, "orangefs_mount", 0);
446 gossip_debug(GOSSIP_SUPER_DEBUG,
447 "orangefs_mount: mount got return value of %d\n", ret);
448 if (ret)
449 goto free_op;
450
451 if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) {
452 gossip_err("ERROR: Retrieved null fs_id\n");
453 ret = -EINVAL;
454 goto free_op;
455 }
456
457 sb = sget(fst, NULL, set_anon_super, flags, NULL);
458
459 if (IS_ERR(sb)) {
460 d = ERR_CAST(sb);
461 goto free_op;
462 }
463
464 ret = orangefs_fill_sb(sb,
465 &new_op->downcall.resp.fs_mount, data,
466 flags & MS_SILENT ? 1 : 0);
467
468 if (ret) {
469 d = ERR_PTR(ret);
470 goto free_op;
471 }
472
473 /*
474 * on successful mount, store the devname that
475 * was used
476 */
477 strncpy(ORANGEFS_SB(sb)->devname,
478 devname,
479 ORANGEFS_MAX_SERVER_ADDR_LEN);
480
481 /* mount_pending must be cleared */
482 ORANGEFS_SB(sb)->mount_pending = 0;
483
484 /*
485 * finally, add this sb to our list of known orangefs
486 * sb's
487 */
488 gossip_debug(GOSSIP_SUPER_DEBUG,
489 "Adding SB %p to orangefs superblocks\n",
490 ORANGEFS_SB(sb));
491 spin_lock(&orangefs_superblocks_lock);
492 list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
493 spin_unlock(&orangefs_superblocks_lock);
494 op_release(new_op);
495 return dget(sb->s_root);
496
497free_op:
498 gossip_err("orangefs_mount: mount request failed with %d\n", ret);
499 if (ret == -EINVAL) {
500 gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n");
501 gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
502 }
503
504 op_release(new_op);
505
506 return d;
507}
508
509void orangefs_kill_sb(struct super_block *sb)
510{
511 gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
512
513 /* generic superblock cleanup provided by the VFS */
514 kill_anon_super(sb);
515
516 /*
517 * issue the unmount to userspace to tell it to remove the
518 * dynamic mount info it has for this superblock
519 */
520 orangefs_unmount_sb(sb);
521
522 /* remove the sb from our list of orangefs specific sb's */
523
524 spin_lock(&orangefs_superblocks_lock);
525 __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */
526 ORANGEFS_SB(sb)->list.prev = NULL;
527 spin_unlock(&orangefs_superblocks_lock);
528
529 /*
530 * make sure that any ORANGEFS_DEV_REMOUNT_ALL loop that might've
531 * seen us has completed before we free the dang thing.
532 */
533 mutex_lock(&request_mutex);
534 mutex_unlock(&request_mutex);
535
536 /* free the orangefs superblock private data */
537 kfree(ORANGEFS_SB(sb));
538}
539
540int orangefs_inode_cache_initialize(void)
541{
542 orangefs_inode_cache = kmem_cache_create("orangefs_inode_cache",
543 sizeof(struct orangefs_inode_s),
544 0,
545 ORANGEFS_CACHE_CREATE_FLAGS,
546 orangefs_inode_cache_ctor);
547
548 if (!orangefs_inode_cache) {
549 gossip_err("Cannot create orangefs_inode_cache\n");
550 return -ENOMEM;
551 }
552 return 0;
553}
554
555int orangefs_inode_cache_finalize(void)
556{
557 kmem_cache_destroy(orangefs_inode_cache);
558 return 0;
559}
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
new file mode 100644
index 000000000000..6418dd638680
--- /dev/null
+++ b/fs/orangefs/symlink.c
@@ -0,0 +1,19 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#include "protocol.h"
8#include "orangefs-kernel.h"
9#include "orangefs-bufmap.h"
10
11struct inode_operations orangefs_symlink_inode_operations = {
12 .readlink = generic_readlink,
13 .get_link = simple_get_link,
14 .setattr = orangefs_setattr,
15 .getattr = orangefs_getattr,
16 .listxattr = orangefs_listxattr,
17 .setxattr = generic_setxattr,
18 .permission = orangefs_permission,
19};
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
new file mode 100644
index 000000000000..001b20239407
--- /dev/null
+++ b/fs/orangefs/upcall.h
@@ -0,0 +1,246 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7#ifndef __UPCALL_H
8#define __UPCALL_H
9
10/*
11 * This header file has been sanitized to fix
12 * 32-64 bit interaction issues between
13 * client-core and device (alignment sketch after the first struct).
14 */
15struct orangefs_io_request_s {
16 __s32 __pad1;
17 __s32 buf_index;
18 __s32 count;
19 __s32 __pad2;
20 __s64 offset;
21 struct orangefs_object_kref refn;
22 enum ORANGEFS_io_type io_type;
23 __s32 readahead_size;
24};
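
A minimal illustration of the padding convention referenced above (mine, not from the patch): the explicit __pad members keep each __s64 at the same byte offset whether client-core is built 32-bit or 64-bit.

	/*
	 * Illustration only.  On 32-bit x86 __s64 is 4-byte aligned, so
	 * without a pad 'offset' would land at byte 12 there, but at
	 * byte 16 on x86-64:
	 *
	 *	struct bad  { __s32 a, b, c;        __s64 offset; };
	 *
	 * With an explicit pad both sides agree on byte 16:
	 *
	 *	struct good { __s32 a, b, c, __pad; __s64 offset; };
	 */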
25
26struct orangefs_lookup_request_s {
27 __s32 sym_follow;
28 __s32 __pad1;
29 struct orangefs_object_kref parent_refn;
30 char d_name[ORANGEFS_NAME_MAX];
31};
32
33struct orangefs_create_request_s {
34 struct orangefs_object_kref parent_refn;
35 struct ORANGEFS_sys_attr_s attributes;
36 char d_name[ORANGEFS_NAME_MAX];
37};
38
39struct orangefs_symlink_request_s {
40 struct orangefs_object_kref parent_refn;
41 struct ORANGEFS_sys_attr_s attributes;
42 char entry_name[ORANGEFS_NAME_MAX];
43 char target[ORANGEFS_NAME_MAX];
44};
45
46struct orangefs_getattr_request_s {
47 struct orangefs_object_kref refn;
48 __u32 mask;
49 __u32 __pad1;
50};
51
52struct orangefs_setattr_request_s {
53 struct orangefs_object_kref refn;
54 struct ORANGEFS_sys_attr_s attributes;
55};
56
57struct orangefs_remove_request_s {
58 struct orangefs_object_kref parent_refn;
59 char d_name[ORANGEFS_NAME_MAX];
60};
61
62struct orangefs_mkdir_request_s {
63 struct orangefs_object_kref parent_refn;
64 struct ORANGEFS_sys_attr_s attributes;
65 char d_name[ORANGEFS_NAME_MAX];
66};
67
68struct orangefs_readdir_request_s {
69 struct orangefs_object_kref refn;
70 __u64 token;
71 __s32 max_dirent_count;
72 __s32 buf_index;
73};
74
75struct orangefs_readdirplus_request_s {
76 struct orangefs_object_kref refn;
77 __u64 token;
78 __s32 max_dirent_count;
79 __u32 mask;
80 __s32 buf_index;
81 __s32 __pad1;
82};
83
84struct orangefs_rename_request_s {
85 struct orangefs_object_kref old_parent_refn;
86 struct orangefs_object_kref new_parent_refn;
87 char d_old_name[ORANGEFS_NAME_MAX];
88 char d_new_name[ORANGEFS_NAME_MAX];
89};
90
91struct orangefs_statfs_request_s {
92 __s32 fs_id;
93 __s32 __pad1;
94};
95
96struct orangefs_truncate_request_s {
97 struct orangefs_object_kref refn;
98 __s64 size;
99};
100
101struct orangefs_mmap_ra_cache_flush_request_s {
102 struct orangefs_object_kref refn;
103};
104
105struct orangefs_fs_mount_request_s {
106 char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
107};
108
109struct orangefs_fs_umount_request_s {
110 __s32 id;
111 __s32 fs_id;
112 char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
113};
114
115struct orangefs_getxattr_request_s {
116 struct orangefs_object_kref refn;
117 __s32 key_sz;
118 __s32 __pad1;
119 char key[ORANGEFS_MAX_XATTR_NAMELEN];
120};
121
122struct orangefs_setxattr_request_s {
123 struct orangefs_object_kref refn;
124 struct ORANGEFS_keyval_pair keyval;
125 __s32 flags;
126 __s32 __pad1;
127};
128
129struct orangefs_listxattr_request_s {
130 struct orangefs_object_kref refn;
131 __s32 requested_count;
132 __s32 __pad1;
133 __u64 token;
134};
135
136struct orangefs_removexattr_request_s {
137 struct orangefs_object_kref refn;
138 __s32 key_sz;
139 __s32 __pad1;
140 char key[ORANGEFS_MAX_XATTR_NAMELEN];
141};
142
143struct orangefs_op_cancel_s {
144 __u64 op_tag;
145};
146
147struct orangefs_fsync_request_s {
148 struct orangefs_object_kref refn;
149};
150
151enum orangefs_param_request_type {
152 ORANGEFS_PARAM_REQUEST_SET = 1,
153 ORANGEFS_PARAM_REQUEST_GET = 2
154};
155
156enum orangefs_param_request_op {
157 ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS = 1,
158 ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT = 2,
159 ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT = 3,
160 ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE = 4,
161 ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS = 5,
162 ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE = 6,
163 ORANGEFS_PARAM_REQUEST_OP_PERF_RESET = 7,
164 ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS = 8,
165 ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT = 9,
166 ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT = 10,
167 ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE = 11,
168 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_TIMEOUT_MSECS = 12,
169 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_HARD_LIMIT = 13,
170 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_SOFT_LIMIT = 14,
171 ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_RECLAIM_PERCENTAGE = 15,
172 ORANGEFS_PARAM_REQUEST_OP_CLIENT_DEBUG = 16,
173 ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS = 17,
174 ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT = 18,
175 ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT = 19,
176 ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE = 20,
177 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS = 21,
178 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT = 22,
179 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
180 ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
181 ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
182};
183
184struct orangefs_param_request_s {
185 enum orangefs_param_request_type type;
186 enum orangefs_param_request_op op;
187 __s64 value;
188 char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
189};
190
191enum orangefs_perf_count_request_type {
192 ORANGEFS_PERF_COUNT_REQUEST_ACACHE = 1,
193 ORANGEFS_PERF_COUNT_REQUEST_NCACHE = 2,
194 ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE = 3,
195};
196
197struct orangefs_perf_count_request_s {
198 enum orangefs_perf_count_request_type type;
199 __s32 __pad1;
200};
201
202struct orangefs_fs_key_request_s {
203 __s32 fsid;
204 __s32 __pad1;
205};
206
207struct orangefs_upcall_s {
208 __s32 type;
209 __u32 uid;
210 __u32 gid;
211 int pid;
212 int tgid;
213 /* Trailers unused but must be retained for protocol compatibility. */
214 __s64 trailer_size;
215 char *trailer_buf;
216
217 union {
218 struct orangefs_io_request_s io;
219 struct orangefs_lookup_request_s lookup;
220 struct orangefs_create_request_s create;
221 struct orangefs_symlink_request_s sym;
222 struct orangefs_getattr_request_s getattr;
223 struct orangefs_setattr_request_s setattr;
224 struct orangefs_remove_request_s remove;
225 struct orangefs_mkdir_request_s mkdir;
226 struct orangefs_readdir_request_s readdir;
227 struct orangefs_readdirplus_request_s readdirplus;
228 struct orangefs_rename_request_s rename;
229 struct orangefs_statfs_request_s statfs;
230 struct orangefs_truncate_request_s truncate;
231 struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
232 struct orangefs_fs_mount_request_s fs_mount;
233 struct orangefs_fs_umount_request_s fs_umount;
234 struct orangefs_getxattr_request_s getxattr;
235 struct orangefs_setxattr_request_s setxattr;
236 struct orangefs_listxattr_request_s listxattr;
237 struct orangefs_removexattr_request_s removexattr;
238 struct orangefs_op_cancel_s cancel;
239 struct orangefs_fsync_request_s fsync;
240 struct orangefs_param_request_s param;
241 struct orangefs_perf_count_request_s perf_count;
242 struct orangefs_fs_key_request_s fs_key;
243 } req;
244};
245
246#endif /* __UPCALL_H */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
new file mode 100644
index 000000000000..31635bc303fe
--- /dev/null
+++ b/fs/orangefs/waitqueue.c
@@ -0,0 +1,357 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 * (C) 2011 Omnibond Systems
4 *
5 * Changes by Acxiom Corporation to implement generic service_operation()
6 * function, Copyright Acxiom Corporation, 2005.
7 *
8 * See COPYING in top-level directory.
9 */
10
11/*
12 * In-kernel waitqueue operations.
13 */
14
15#include "protocol.h"
16#include "orangefs-kernel.h"
17#include "orangefs-bufmap.h"
18
19static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool);
20static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *);
21
22/*
23 * Walk the list of operations that are present in the request
24 * queue and mark them as purged.
25 * NOTE: This is called from the device close after client-core has
26 * guaranteed that no new operations could appear on the list, since
27 * client-core is about to exit.
28 */
29void purge_waiting_ops(void)
30{
31 struct orangefs_kernel_op_s *op;
32
33 spin_lock(&orangefs_request_list_lock);
34 list_for_each_entry(op, &orangefs_request_list, list) {
35 gossip_debug(GOSSIP_WAIT_DEBUG,
36 "pvfs2-client-core: purging op tag %llu %s\n",
37 llu(op->tag),
38 get_opname_string(op));
39 set_op_state_purged(op);
40 gossip_debug(GOSSIP_DEV_DEBUG,
41 "%s: op:%s: op_state:%d: process:%s:\n",
42 __func__,
43 get_opname_string(op),
44 op->op_state,
45 current->comm);
46 }
47 spin_unlock(&orangefs_request_list_lock);
48}
49
50/*
51 * submits an ORANGEFS operation and waits for it to complete
52 *
53 * Note that op->downcall.status will contain the status of the operation (in
54 * errno format), whether provided by pvfs2-client or a result of failure to
55 * service the operation. If the caller wishes to distinguish, then
56 * op->op_state can be checked to see if it was serviced or not.
57 *
58 * Returns the contents of op->downcall.status for convenience (a caller sketch follows the function).
59 */
60int service_operation(struct orangefs_kernel_op_s *op,
61 const char *op_name,
62 int flags)
63{
64 long timeout = MAX_SCHEDULE_TIMEOUT;
65 int ret = 0;
66
67 DEFINE_WAIT(wait_entry);
68
69 op->upcall.tgid = current->tgid;
70 op->upcall.pid = current->pid;
71
72retry_servicing:
73 op->downcall.status = 0;
74 gossip_debug(GOSSIP_WAIT_DEBUG,
75 "%s: %s op:%p: process:%s: pid:%d:\n",
76 __func__,
77 op_name,
78 op,
79 current->comm,
80 current->pid);
81
82 /*
83 * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid
84 * acquiring the request_mutex because we're servicing a
85 * high priority remount operation and the request_mutex is
86 * already taken.
87 */
88 if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
89 if (flags & ORANGEFS_OP_INTERRUPTIBLE)
90 ret = mutex_lock_interruptible(&request_mutex);
91 else
92 ret = mutex_lock_killable(&request_mutex);
93 /*
94 * check to see if we were interrupted while waiting for
95 * mutex
96 */
97 if (ret < 0) {
98 op->downcall.status = ret;
99 gossip_debug(GOSSIP_WAIT_DEBUG,
100 "%s: service_operation interrupted.\n",
101 __func__);
102 return ret;
103 }
104 }
105
106 /* queue up the operation */
107 spin_lock(&orangefs_request_list_lock);
108 spin_lock(&op->lock);
109 set_op_state_waiting(op);
110 gossip_debug(GOSSIP_DEV_DEBUG,
111 "%s: op:%s: op_state:%d: process:%s:\n",
112 __func__,
113 get_opname_string(op),
114 op->op_state,
115 current->comm);
116 /* add high priority remount op to the front of the line. */
117 if (flags & ORANGEFS_OP_PRIORITY)
118 list_add(&op->list, &orangefs_request_list);
119 else
120 list_add_tail(&op->list, &orangefs_request_list);
121 spin_unlock(&op->lock);
122 wake_up_interruptible(&orangefs_request_list_waitq);
123 if (!__is_daemon_in_service()) {
124 gossip_debug(GOSSIP_WAIT_DEBUG,
125 "%s:client core is NOT in service.\n",
126 __func__);
127 timeout = op_timeout_secs * HZ;
128 }
129 spin_unlock(&orangefs_request_list_lock);
130
131 if (!(flags & ORANGEFS_OP_NO_MUTEX))
132 mutex_unlock(&request_mutex);
133
134 ret = wait_for_matching_downcall(op, timeout,
135 flags & ORANGEFS_OP_INTERRUPTIBLE);
136
137 gossip_debug(GOSSIP_WAIT_DEBUG,
138 "%s: wait_for_matching_downcall returned %d for %p\n",
139 __func__,
140 ret,
141 op);
142
143 /* got matching downcall; make sure status is in errno format */
144 if (!ret) {
145 spin_unlock(&op->lock);
146 op->downcall.status =
147 orangefs_normalize_to_errno(op->downcall.status);
148 ret = op->downcall.status;
149 goto out;
150 }
151
152 /* failed to get matching downcall */
153 if (ret == -ETIMEDOUT) {
154 gossip_err("%s: %s -- wait timed out; aborting attempt.\n",
155 __func__,
156 op_name);
157 }
158
159 /*
160 * remove a waiting op from the request list or
161 * remove an in-progress op from the in-progress list.
162 */
163 orangefs_clean_up_interrupted_operation(op);
164
165 op->downcall.status = ret;
166 /* retry if operation has not been serviced and if requested */
167 if (ret == -EAGAIN) {
168 op->attempts++;
169 timeout = op_timeout_secs * HZ;
170 gossip_debug(GOSSIP_WAIT_DEBUG,
171 "orangefs: tag %llu (%s)"
172 " -- operation to be retried (%d attempt)\n",
173 llu(op->tag),
174 op_name,
175 op->attempts);
176
177 /*
178 * io ops (ops that use the shared memory buffer) have
179 * to be returned to their caller for a retry. Other ops
180 * can just be recycled here.
181 */
182 if (!op->uses_shared_memory)
183 goto retry_servicing;
184 }
185
186out:
187 gossip_debug(GOSSIP_WAIT_DEBUG,
188 "%s: %s returning: %d for %p.\n",
189 __func__,
190 op_name,
191 ret,
192 op);
193 return ret;
194}
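
The calling convention is easiest to see with a minimal caller. This sketch mirrors the real callers elsewhere in the patch; op_alloc(), op_release() and the upcall/downcall fields are real, the wrapper itself is hypothetical, the ORANGEFS_VFS_OP_GETATTR opcode is assumed from the driver's naming, and real getattr callers also fill in a mask:

	/* Hypothetical caller, for illustration only. */
	static int example_getattr(struct orangefs_inode_s *orangefs_inode)
	{
		struct orangefs_kernel_op_s *new_op;
		int ret;

		new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
		if (!new_op)
			return -ENOMEM;
		new_op->upcall.req.getattr.refn = orangefs_inode->refn;

		/* Queue the upcall, sleep until the matching downcall. */
		ret = service_operation(new_op, "example_getattr", 0);

		/* Here ret == new_op->downcall.status, in errno format. */
		op_release(new_op);
		return ret;
	}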
195
196/* This can get called on an I/O op if it had a bad service_operation. */
197bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op)
198{
199 u64 tag = op->tag;
200 if (!op_state_in_progress(op))
201 return false;
202
203 op->slot_to_free = op->upcall.req.io.buf_index;
204 memset(&op->upcall, 0, sizeof(op->upcall));
205 memset(&op->downcall, 0, sizeof(op->downcall));
206 op->upcall.type = ORANGEFS_VFS_OP_CANCEL;
207 op->upcall.req.cancel.op_tag = tag;
208 op->downcall.type = ORANGEFS_VFS_OP_INVALID;
209 op->downcall.status = -1;
210 orangefs_new_tag(op);
211
212 spin_lock(&orangefs_request_list_lock);
213 /* orangefs_request_list_lock is enough of a barrier here */
214 if (!__is_daemon_in_service()) {
215 spin_unlock(&orangefs_request_list_lock);
216 return false;
217 }
218 spin_lock(&op->lock);
219 set_op_state_waiting(op);
220 gossip_debug(GOSSIP_DEV_DEBUG,
221 "%s: op:%s: op_state:%d: process:%s:\n",
222 __func__,
223 get_opname_string(op),
224 op->op_state,
225 current->comm);
226 list_add(&op->list, &orangefs_request_list);
227 spin_unlock(&op->lock);
228 spin_unlock(&orangefs_request_list_lock);
229
230 gossip_debug(GOSSIP_WAIT_DEBUG,
231 "Attempting ORANGEFS operation cancellation of tag %llu\n",
232 llu(tag));
233 return true;
234}
235
236/*
237 * Change an op to the "given up" state and remove it from its list.
238 */
239static void
240orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
241{
242 /*
243 * handle interrupted cases depending on what state we were in when
244 * the interruption is detected.
245 *
246 * Called with op->lock held.
247 */
248
249 /*
250 * List manipulation code elsewhere will ignore ops that
251 * have been given up upon.
252 */
253 op->op_state |= OP_VFS_STATE_GIVEN_UP;
254
255 if (list_empty(&op->list)) {
256 /* caught copying to/from daemon */
257 BUG_ON(op_state_serviced(op));
258 spin_unlock(&op->lock);
259 wait_for_completion(&op->waitq);
260 } else if (op_state_waiting(op)) {
261 /*
262 * upcall hasn't been read; remove op from upcall request
263 * list.
264 */
265 spin_unlock(&op->lock);
266 spin_lock(&orangefs_request_list_lock);
267 list_del_init(&op->list);
268 spin_unlock(&orangefs_request_list_lock);
269 gossip_debug(GOSSIP_WAIT_DEBUG,
270 "Interrupted: Removed op %p from request_list\n",
271 op);
272 } else if (op_state_in_progress(op)) {
273 /* op must be removed from the in progress htable */
274 spin_unlock(&op->lock);
275 spin_lock(&htable_ops_in_progress_lock);
276 list_del_init(&op->list);
277 spin_unlock(&htable_ops_in_progress_lock);
278 gossip_debug(GOSSIP_WAIT_DEBUG,
279 "Interrupted: Removed op %p"
280 " from htable_ops_in_progress\n",
281 op);
282 } else {
283 spin_unlock(&op->lock);
284 gossip_err("interrupted operation is in a weird state 0x%x\n",
285 op->op_state);
286 }
287 reinit_completion(&op->waitq);
288}
289
290/*
291 * Sleeps on waitqueue waiting for matching downcall.
292 * If client-core finishes servicing the request, we are good to go;
293 * if client-core exits instead, we get woken up here and retry with a timeout.
294 *
295 * When this call returns to the caller, the specified op will no
296 * longer be in either the in_progress hash table or on the request list.
297 *
298 * Returns 0 on success and -errno on failure
299 * Errors are:
300 * EAGAIN in case we want the caller to requeue and try again.
301 * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this
302 * operation since client-core seems to be exiting too often
303 * or if we were interrupted.
304 *
305 * Returns with op->lock taken.
306 */
307static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
308 long timeout,
309 bool interruptible)
310{
311 long n;
312
313 /*
314 * There's a "schedule_timeout" inside of these wait
315 * primitives, during which the op is out of the hands of the
316 * user process that needs something done and is being
317 * manipulated by the client-core process.
318 */
319 if (interruptible)
320 n = wait_for_completion_interruptible_timeout(&op->waitq,
321 timeout);
322 else
323 n = wait_for_completion_killable_timeout(&op->waitq, timeout);
324
325 spin_lock(&op->lock);
326
327 if (op_state_serviced(op))
328 return 0;
329
330 if (unlikely(n < 0)) {
331 gossip_debug(GOSSIP_WAIT_DEBUG,
332 "%s: operation interrupted, tag %llu, %p\n",
333 __func__,
334 llu(op->tag),
335 op);
336 return -EINTR;
337 }
338 if (op_state_purged(op)) {
339 gossip_debug(GOSSIP_WAIT_DEBUG,
340 "%s: operation purged, tag %llu, %p, %d\n",
341 __func__,
342 llu(op->tag),
343 op,
344 op->attempts);
345 return (op->attempts < ORANGEFS_PURGE_RETRY_COUNT) ?
346 -EAGAIN :
347 -EIO;
348 }
349 /* must have timed out, then... */
350 gossip_debug(GOSSIP_WAIT_DEBUG,
351 "%s: operation timed out, tag %llu, %p, %d)\n",
352 __func__,
353 llu(op->tag),
354 op,
355 op->attempts);
356 return -ETIMEDOUT;
357}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
new file mode 100644
index 000000000000..ef5da7538cd5
--- /dev/null
+++ b/fs/orangefs/xattr.c
@@ -0,0 +1,545 @@
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS extended attribute operations.
9 */
10
11#include "protocol.h"
12#include "orangefs-kernel.h"
13#include "orangefs-bufmap.h"
14#include <linux/posix_acl_xattr.h>
15#include <linux/xattr.h>
16
17
18#define SYSTEM_ORANGEFS_KEY "system.pvfs2."
19#define SYSTEM_ORANGEFS_KEY_LEN 13
20
21/*
22 * This function returns:
23 * 0 if the key corresponding to name is not meant to be listed as part
24 * of a listxattr,
25 * 1 if the key corresponding to name is meant to be returned as part of
26 * a listxattr.
27 * Keys that start with SYSTEM_ORANGEFS_KEY are the ones to hide.
28 */
29static int is_reserved_key(const char *key, size_t size)
30{
31
32 if (size < SYSTEM_ORANGEFS_KEY_LEN)
33 return 1;
34
35 return strncmp(key, SYSTEM_ORANGEFS_KEY, SYSTEM_ORANGEFS_KEY_LEN) ? 1 : 0;
36}
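
A few worked examples of the intended behavior (illustrative only; sizes include the NUL terminator, matching the lengths the listxattr loop below passes in):

	is_reserved_key("user.foo", 9);				/* 1: shorter than the prefix, listed */
	is_reserved_key("system.posix_acl_access", 24);		/* 1: different prefix, listed */
	is_reserved_key("system.pvfs2.attr", 18);		/* 0: reserved, hidden */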
37
38static inline int convert_to_internal_xattr_flags(int setxattr_flags)
39{
40 int internal_flag = 0;
41
42 if (setxattr_flags & XATTR_REPLACE) {
43 /* Attribute must exist! */
44 internal_flag = ORANGEFS_XATTR_REPLACE;
45 } else if (setxattr_flags & XATTR_CREATE) {
46 /* Attribute must not exist */
47 internal_flag = ORANGEFS_XATTR_CREATE;
48 }
49 return internal_flag;
50}
51
52
53/*
54 * Tries to get the value for a specified key on a given
55 * file into a user-specified buffer. Note that the getxattr
56 * interface allows users to probe the size of an extended
57 * attribute by passing in a size of 0 (sketch below).
58 * Thus our return value is always the size of the attribute
59 * unless the key does not exist for the file and/or if
60 * there were errors in fetching the attribute value.
61 */
62ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
63 const char *name, void *buffer, size_t size)
64{
65 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
66 struct orangefs_kernel_op_s *new_op = NULL;
67 ssize_t ret = -ENOMEM;
68 ssize_t length = 0;
69 int fsuid;
70 int fsgid;
71
72 gossip_debug(GOSSIP_XATTR_DEBUG,
73 "%s: prefix %s name %s, buffer_size %zd\n",
74 __func__, prefix, name, size);
75
76 if (name == NULL || (size > 0 && buffer == NULL)) {
77 gossip_err("orangefs_inode_getxattr: bogus NULL pointers\n");
78 return -EINVAL;
79 }
80 if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) {
81 gossip_err("Invalid key length (%d)\n",
82 (int)(strlen(name) + strlen(prefix)));
83 return -EINVAL;
84 }
85
86 fsuid = from_kuid(current_user_ns(), current_fsuid());
87 fsgid = from_kgid(current_user_ns(), current_fsgid());
88
89 gossip_debug(GOSSIP_XATTR_DEBUG,
90 "getxattr on inode %pU, name %s "
91 "(uid %o, gid %o)\n",
92 get_khandle_from_ino(inode),
93 name,
94 fsuid,
95 fsgid);
96
97 down_read(&orangefs_inode->xattr_sem);
98
99 new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR);
100 if (!new_op)
101 goto out_unlock;
102
103 new_op->upcall.req.getxattr.refn = orangefs_inode->refn;
104 ret = snprintf((char *)new_op->upcall.req.getxattr.key,
105 ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name);
106
107 /*
108 * NOTE: Although keys are meant to be NULL terminated textual
109 * strings, I am going to explicitly pass the length just in case
110 * we change this later on...
111 */
112 new_op->upcall.req.getxattr.key_sz = ret + 1;
113
114 ret = service_operation(new_op, "orangefs_inode_getxattr",
115 get_interruptible_flag(inode));
116 if (ret != 0) {
117 if (ret == -ENOENT) {
118 ret = -ENODATA;
119 gossip_debug(GOSSIP_XATTR_DEBUG,
120 "orangefs_inode_getxattr: inode %pU key %s"
121 " does not exist!\n",
122 get_khandle_from_ino(inode),
123 (char *)new_op->upcall.req.getxattr.key);
124 }
125 goto out_release_op;
126 }
127
128 /*
129 * Length returned includes null terminator.
130 */
131 length = new_op->downcall.resp.getxattr.val_sz;
132
133 /*
134 * Just return the length of the queried attribute.
135 */
136 if (size == 0) {
137 ret = length;
138 goto out_release_op;
139 }
140
141 /*
142 * Check to see if key length is > provided buffer size.
143 */
144 if (length > size) {
145 ret = -ERANGE;
146 goto out_release_op;
147 }
148
149 memset(buffer, 0, size);
150 memcpy(buffer, new_op->downcall.resp.getxattr.val, length);
151 gossip_debug(GOSSIP_XATTR_DEBUG,
152 "orangefs_inode_getxattr: inode %pU "
153 "key %s key_sz %d, val_len %d\n",
154 get_khandle_from_ino(inode),
155 (char *)new_op->
156 upcall.req.getxattr.key,
157 (int)new_op->
158 upcall.req.getxattr.key_sz,
159 (int)length);
160
161 ret = length;
162
163out_release_op:
164 op_release(new_op);
165out_unlock:
166 up_read(&orangefs_inode->xattr_sem);
167 return ret;
168}
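
From userspace, the size probe described above is the standard getxattr(2) two-call pattern. An illustrative sketch, assuming <sys/xattr.h> and <stdlib.h>; use() is a hypothetical consumer:

	ssize_t len = getxattr(path, "user.foo", NULL, 0);	/* probe size */
	if (len > 0) {
		char *val = malloc(len);

		if (val && getxattr(path, "user.foo", val, len) == len)
			use(val, len);
		free(val);
	}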
169
170static int orangefs_inode_removexattr(struct inode *inode,
171 const char *prefix,
172 const char *name,
173 int flags)
174{
175 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
176 struct orangefs_kernel_op_s *new_op = NULL;
177 int ret = -ENOMEM;
178
179 down_write(&orangefs_inode->xattr_sem);
180 new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
181 if (!new_op)
182 goto out_unlock;
183
184 new_op->upcall.req.removexattr.refn = orangefs_inode->refn;
185 /*
186 * NOTE: Although keys are meant to be NULL terminated
187 * textual strings, I am going to explicitly pass the
188 * length just in case we change this later on...
189 */
190 ret = snprintf((char *)new_op->upcall.req.removexattr.key,
191 ORANGEFS_MAX_XATTR_NAMELEN,
192 "%s%s",
193 (prefix ? prefix : ""),
194 name);
195 new_op->upcall.req.removexattr.key_sz = ret + 1;
196
197 gossip_debug(GOSSIP_XATTR_DEBUG,
198 "orangefs_inode_removexattr: key %s, key_sz %d\n",
199 (char *)new_op->upcall.req.removexattr.key,
200 (int)new_op->upcall.req.removexattr.key_sz);
201
202 ret = service_operation(new_op,
203 "orangefs_inode_removexattr",
204 get_interruptible_flag(inode));
205 if (ret == -ENOENT) {
206 /*
207 * Request to replace a non-existent attribute is an error.
208 */
209 if (flags & XATTR_REPLACE)
210 ret = -ENODATA;
211 else
212 ret = 0;
213 }
214
215 gossip_debug(GOSSIP_XATTR_DEBUG,
216 "orangefs_inode_removexattr: returning %d\n", ret);
217
218 op_release(new_op);
219out_unlock:
220 up_write(&orangefs_inode->xattr_sem);
221 return ret;
222}
223
224/*
225 * Tries to set an attribute for a given key on a file.
226 *
227 * Returns a negative number on error and 0 on success. Key is text, but value
228 * can be binary!
229 */
230int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
231 const char *name, const void *value, size_t size, int flags)
232{
233 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
234 struct orangefs_kernel_op_s *new_op;
235 int internal_flag = 0;
236 int ret = -ENOMEM;
237
238 gossip_debug(GOSSIP_XATTR_DEBUG,
239 "%s: prefix %s, name %s, buffer_size %zd\n",
240 __func__, prefix, name, size);
241
242 if (size < 0 ||
243 size >= ORANGEFS_MAX_XATTR_VALUELEN ||
244 flags < 0) {
245 gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
246 (int)size,
247 flags);
248 return -EINVAL;
249 }
250
251 if (name == NULL ||
252 (size > 0 && value == NULL)) {
253 gossip_err("orangefs_inode_setxattr: bogus NULL pointers!\n");
254 return -EINVAL;
255 }
256
257 internal_flag = convert_to_internal_xattr_flags(flags);
258
259 if (prefix) {
260 if (strlen(name) + strlen(prefix) >= ORANGEFS_MAX_XATTR_NAMELEN) {
261 gossip_err
262 ("orangefs_inode_setxattr: bogus key size (%d)\n",
263 (int)(strlen(name) + strlen(prefix)));
264 return -EINVAL;
265 }
266 } else {
267 if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
268 gossip_err
269 ("orangefs_inode_setxattr: bogus key size (%d)\n",
270 (int)(strlen(name)));
271 return -EINVAL;
272 }
273 }
274
275 /* This is equivalent to a removexattr */
276 if (size == 0 && value == NULL) {
277 gossip_debug(GOSSIP_XATTR_DEBUG,
278 "removing xattr (%s%s)\n",
279 prefix,
280 name);
281 return orangefs_inode_removexattr(inode, prefix, name, flags);
282 }
283
284 gossip_debug(GOSSIP_XATTR_DEBUG,
285 "setxattr on inode %pU, name %s\n",
286 get_khandle_from_ino(inode),
287 name);
288
289 down_write(&orangefs_inode->xattr_sem);
290 new_op = op_alloc(ORANGEFS_VFS_OP_SETXATTR);
291 if (!new_op)
292 goto out_unlock;
293
294
295 new_op->upcall.req.setxattr.refn = orangefs_inode->refn;
296 new_op->upcall.req.setxattr.flags = internal_flag;
297 /*
298 * NOTE: Although keys are meant to be NULL terminated textual
299 * strings, I am going to explicitly pass the length just in
300 * case we change this later on...
301 */
302 ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key,
303 ORANGEFS_MAX_XATTR_NAMELEN,
304 "%s%s",
305 prefix, name);
306 new_op->upcall.req.setxattr.keyval.key_sz = ret + 1;
307 memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
308 new_op->upcall.req.setxattr.keyval.val_sz = size;
309
310 gossip_debug(GOSSIP_XATTR_DEBUG,
311 "orangefs_inode_setxattr: key %s, key_sz %d "
312 " value size %zd\n",
313 (char *)new_op->upcall.req.setxattr.keyval.key,
314 (int)new_op->upcall.req.setxattr.keyval.key_sz,
315 size);
316
317 ret = service_operation(new_op,
318 "orangefs_inode_setxattr",
319 get_interruptible_flag(inode));
320
321 gossip_debug(GOSSIP_XATTR_DEBUG,
322 "orangefs_inode_setxattr: returning %d\n",
323 ret);
324
325 /* when request is serviced properly, free req op struct */
326 op_release(new_op);
327out_unlock:
328 up_write(&orangefs_inode->xattr_sem);
329 return ret;
330}
331
332/*
333 * Tries to get a specified object's keys into a user-specified buffer of a
334 * given size. Note that like the previous instances of xattr routines, this
335 * also allows you to pass in a NULL pointer and 0 size to probe the size for
336 * subsequent memory allocations. Thus our return value is always the size of
337 * all the keys unless there were errors in fetching the keys!
338 */
339ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size)
340{
341 struct inode *inode = dentry->d_inode;
342 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
343 struct orangefs_kernel_op_s *new_op;
344 __u64 token = ORANGEFS_ITERATE_START;
345 ssize_t ret = -ENOMEM;
346 ssize_t total = 0;
347 int count_keys = 0;
348 int key_size;
349 int i = 0;
350 int returned_count = 0;
351
352 if (size > 0 && buffer == NULL) {
353 gossip_err("%s: bogus NULL pointers\n", __func__);
354 return -EINVAL;
355 }
356 if (size < 0) {
357 gossip_err("Invalid size (%d)\n", (int)size);
358 return -EINVAL;
359 }
360
361 down_read(&orangefs_inode->xattr_sem);
362 new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR);
363 if (!new_op)
364 goto out_unlock;
365
366 if (buffer && size > 0)
367 memset(buffer, 0, size);
368
369try_again:
370 key_size = 0;
371 new_op->upcall.req.listxattr.refn = orangefs_inode->refn;
372 new_op->upcall.req.listxattr.token = token;
373 new_op->upcall.req.listxattr.requested_count =
374 (size == 0) ? 0 : ORANGEFS_MAX_XATTR_LISTLEN;
375 ret = service_operation(new_op, __func__,
376 get_interruptible_flag(inode));
377 if (ret != 0)
378 goto done;
379
380 if (size == 0) {
381 /*
382 * This is a bit of a big upper limit, but I did not want to
383 * spend too much time getting this correct, since users end
384 * up allocating memory rather than us...
385 */
386 total = new_op->downcall.resp.listxattr.returned_count *
387 ORANGEFS_MAX_XATTR_NAMELEN;
388 goto done;
389 }
390
391 returned_count = new_op->downcall.resp.listxattr.returned_count;
392 if (returned_count < 0 ||
393 returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
394 gossip_err("%s: impossible value for returned_count:%d:\n",
395 __func__,
396 returned_count);
397 ret = -EIO;
398 goto done;
399 }
400
401 /*
402 * Check to see how much can be fit in the buffer. Fit only whole keys.
403 */
404 for (i = 0; i < returned_count; i++) {
405 if (new_op->downcall.resp.listxattr.lengths[i] < 0 ||
406 new_op->downcall.resp.listxattr.lengths[i] >
407 ORANGEFS_MAX_XATTR_NAMELEN) {
408 gossip_err("%s: impossible value for lengths[%d]\n",
409 __func__,
410 new_op->downcall.resp.listxattr.lengths[i]);
411 ret = -EIO;
412 goto done;
413 }
414 if (total + new_op->downcall.resp.listxattr.lengths[i] > size)
415 goto done;
416
417 /*
418 * Since many dumb programs try to setxattr() on our reserved
419 * xattrs, this is a feeble attempt at defeating those by not
420 * listing them in the output of listxattr... sigh.
421 */
422 if (is_reserved_key(new_op->downcall.resp.listxattr.key +
423 key_size,
424 new_op->downcall.resp.
425 listxattr.lengths[i])) {
426 gossip_debug(GOSSIP_XATTR_DEBUG, "Copying key %d -> %s\n",
427 i, new_op->downcall.resp.listxattr.key +
428 key_size);
429 memcpy(buffer + total,
430 new_op->downcall.resp.listxattr.key + key_size,
431 new_op->downcall.resp.listxattr.lengths[i]);
432 total += new_op->downcall.resp.listxattr.lengths[i];
433 count_keys++;
434 } else {
435 gossip_debug(GOSSIP_XATTR_DEBUG, "[RESERVED] key %d -> %s\n",
436 i, new_op->downcall.resp.listxattr.key +
437 key_size);
438 }
439 key_size += new_op->downcall.resp.listxattr.lengths[i];
440 }
441
442 /*
443 * Since the buffer was large enough, we might have to continue
444 * fetching more keys!
445 */
446 token = new_op->downcall.resp.listxattr.token;
447 if (token != ORANGEFS_ITERATE_END)
448 goto try_again;
449
450done:
451 gossip_debug(GOSSIP_XATTR_DEBUG, "%s: returning %d"
452 " [size of buffer %ld] (filled in %d keys)\n",
453 __func__,
454 ret ? (int)ret : (int)total,
455 (long)size,
456 count_keys);
457 op_release(new_op);
458 if (ret == 0)
459 ret = total;
460out_unlock:
461 up_read(&orangefs_inode->xattr_sem);
462 return ret;
463}
464
465static int orangefs_xattr_set_default(const struct xattr_handler *handler,
466 struct dentry *dentry,
467 const char *name,
468 const void *buffer,
469 size_t size,
470 int flags)
471{
472 return orangefs_inode_setxattr(dentry->d_inode,
473 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
474 name,
475 buffer,
476 size,
477 flags);
478}
479
480static int orangefs_xattr_get_default(const struct xattr_handler *handler,
481 struct dentry *dentry,
482 const char *name,
483 void *buffer,
484 size_t size)
485{
486 return orangefs_inode_getxattr(dentry->d_inode,
487 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
488 name,
489 buffer,
490 size);
491
492}
493
494static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
495 struct dentry *dentry,
496 const char *name,
497 const void *buffer,
498 size_t size,
499 int flags)
500{
501 return orangefs_inode_setxattr(dentry->d_inode,
502 ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
503 name,
504 buffer,
505 size,
506 flags);
507}
508
509static int orangefs_xattr_get_trusted(const struct xattr_handler *handler,
510 struct dentry *dentry,
511 const char *name,
512 void *buffer,
513 size_t size)
514{
515 return orangefs_inode_getxattr(dentry->d_inode,
516 ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
517 name,
518 buffer,
519 size);
520}
521
522static struct xattr_handler orangefs_xattr_trusted_handler = {
523 .prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
524 .get = orangefs_xattr_get_trusted,
525 .set = orangefs_xattr_set_trusted,
526};
527
528static struct xattr_handler orangefs_xattr_default_handler = {
529 /*
530 * NOTE: this is set to be the empty string,
531 * so that all un-prefixed xattr keys get caught
532 * here!
533 */
534 .prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
535 .get = orangefs_xattr_get_default,
536 .set = orangefs_xattr_set_default,
537};
538
539const struct xattr_handler *orangefs_xattr_handlers[] = {
540 &posix_acl_access_xattr_handler,
541 &posix_acl_default_xattr_handler,
542 &orangefs_xattr_trusted_handler,
543 &orangefs_xattr_default_handler,
544 NULL
545};
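
For orientation, a note on dispatch (generic VFS behavior of this era, not something this patch implements): the VFS selects the handler whose .prefix matches the attribute name and passes the handler the name with that prefix stripped, which is why the wrappers above re-prepend their prefix before building the upcall. Assuming ORANGEFS_XATTR_NAME_TRUSTED_PREFIX is "trusted.":

	/*
	 * Illustration only:
	 *	setxattr("/mnt/orangefs/f", "trusted.foo", "bar", 3, 0);
	 * matches orangefs_xattr_trusted_handler and ends up as
	 *	orangefs_inode_setxattr(inode, "trusted.", "foo", "bar", 3, 0);
	 */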
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d894e7cd9a86..cc514da6f3e7 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -7,6 +7,7 @@
7 * the Free Software Foundation. 7 * the Free Software Foundation.
8 */ 8 */
9 9
10#include <linux/module.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/slab.h> 12#include <linux/slab.h>
12#include <linux/file.h> 13#include <linux/file.h>
@@ -16,10 +17,41 @@
16#include <linux/uaccess.h> 17#include <linux/uaccess.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/namei.h> 19#include <linux/namei.h>
20#include <linux/fdtable.h>
21#include <linux/ratelimit.h>
19#include "overlayfs.h" 22#include "overlayfs.h"
20 23
21#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) 24#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
22 25
26static bool __read_mostly ovl_check_copy_up;
27module_param_named(check_copy_up, ovl_check_copy_up, bool,
28 S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(ovl_check_copy_up,
30 "Warn on copy-up when causing process also has a R/O fd open");
31
32static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
33{
34 const struct dentry *dentry = data;
35
36 if (f->f_inode == d_inode(dentry))
37 pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
38 f, fd, current->pid, current->comm);
39 return 0;
40}
41
42/*
43 * Check the fds open by this process and warn if something like the following
44 * scenario is about to occur:
45 *
46 * fd1 = open("foo", O_RDONLY);
47 * fd2 = open("foo", O_RDWR);
48 */
49static void ovl_do_check_copy_up(struct dentry *dentry)
50{
51 if (ovl_check_copy_up)
52 iterate_fd(current->files, 0, ovl_check_fd, dentry);
53}
54
23int ovl_copy_xattr(struct dentry *old, struct dentry *new) 55int ovl_copy_xattr(struct dentry *old, struct dentry *new)
24{ 56{
25 ssize_t list_size, size, value_size = 0; 57 ssize_t list_size, size, value_size = 0;
@@ -235,6 +267,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
235 267
236 if (S_ISREG(stat->mode)) { 268 if (S_ISREG(stat->mode)) {
237 struct path upperpath; 269 struct path upperpath;
270
238 ovl_path_upper(dentry, &upperpath); 271 ovl_path_upper(dentry, &upperpath);
239 BUG_ON(upperpath.dentry != NULL); 272 BUG_ON(upperpath.dentry != NULL);
240 upperpath.dentry = newdentry; 273 upperpath.dentry = newdentry;
@@ -309,6 +342,8 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
309 if (WARN_ON(!workdir)) 342 if (WARN_ON(!workdir))
310 return -EROFS; 343 return -EROFS;
311 344
345 ovl_do_check_copy_up(lowerpath->dentry);
346
312 ovl_path_upper(parent, &parentpath); 347 ovl_path_upper(parent, &parentpath);
313 upperdir = parentpath.dentry; 348 upperdir = parentpath.dentry;
314 349
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 52f6de5d40a9..b3fc0a35bf62 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -596,21 +596,25 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
596{ 596{
597 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); 597 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
598 struct inode *dir = upperdir->d_inode; 598 struct inode *dir = upperdir->d_inode;
599 struct dentry *upper = ovl_dentry_upper(dentry); 599 struct dentry *upper;
600 int err; 600 int err;
601 601
602 inode_lock_nested(dir, I_MUTEX_PARENT); 602 inode_lock_nested(dir, I_MUTEX_PARENT);
603 upper = lookup_one_len(dentry->d_name.name, upperdir,
604 dentry->d_name.len);
605 err = PTR_ERR(upper);
606 if (IS_ERR(upper))
607 goto out_unlock;
608
603 err = -ESTALE; 609 err = -ESTALE;
604 if (upper->d_parent == upperdir) { 610 if (upper == ovl_dentry_upper(dentry)) {
605 /* Don't let d_delete() think it can reset d_inode */
606 dget(upper);
607 if (is_dir) 611 if (is_dir)
608 err = vfs_rmdir(dir, upper); 612 err = vfs_rmdir(dir, upper);
609 else 613 else
610 err = vfs_unlink(dir, upper, NULL); 614 err = vfs_unlink(dir, upper, NULL);
611 dput(upper);
612 ovl_dentry_version_inc(dentry->d_parent); 615 ovl_dentry_version_inc(dentry->d_parent);
613 } 616 }
617 dput(upper);
614 618
615 /* 619 /*
616 * Keeping this dentry hashed would mean having to release 620 * Keeping this dentry hashed would mean having to release
@@ -620,6 +624,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
620 */ 624 */
621 if (!err) 625 if (!err)
622 d_drop(dentry); 626 d_drop(dentry);
627out_unlock:
623 inode_unlock(dir); 628 inode_unlock(dir);
624 629
625 return err; 630 return err;
@@ -714,7 +719,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
714 struct dentry *trap; 719 struct dentry *trap;
715 bool old_opaque; 720 bool old_opaque;
716 bool new_opaque; 721 bool new_opaque;
717 bool new_create = false;
718 bool cleanup_whiteout = false; 722 bool cleanup_whiteout = false;
719 bool overwrite = !(flags & RENAME_EXCHANGE); 723 bool overwrite = !(flags & RENAME_EXCHANGE);
720 bool is_dir = d_is_dir(old); 724 bool is_dir = d_is_dir(old);
@@ -840,29 +844,38 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
840 844
841 trap = lock_rename(new_upperdir, old_upperdir); 845 trap = lock_rename(new_upperdir, old_upperdir);
842 846
843 olddentry = ovl_dentry_upper(old); 847
844 newdentry = ovl_dentry_upper(new); 848 olddentry = lookup_one_len(old->d_name.name, old_upperdir,
845 if (newdentry) { 849 old->d_name.len);
850 err = PTR_ERR(olddentry);
851 if (IS_ERR(olddentry))
852 goto out_unlock;
853
854 err = -ESTALE;
855 if (olddentry != ovl_dentry_upper(old))
856 goto out_dput_old;
857
858 newdentry = lookup_one_len(new->d_name.name, new_upperdir,
859 new->d_name.len);
860 err = PTR_ERR(newdentry);
861 if (IS_ERR(newdentry))
862 goto out_dput_old;
863
864 err = -ESTALE;
865 if (ovl_dentry_upper(new)) {
846 if (opaquedir) { 866 if (opaquedir) {
847 newdentry = opaquedir; 867 if (newdentry != opaquedir)
848 opaquedir = NULL; 868 goto out_dput;
849 } else { 869 } else {
850 dget(newdentry); 870 if (newdentry != ovl_dentry_upper(new))
871 goto out_dput;
851 } 872 }
852 } else { 873 } else {
853 new_create = true; 874 if (!d_is_negative(newdentry) &&
854 newdentry = lookup_one_len(new->d_name.name, new_upperdir, 875 (!new_opaque || !ovl_is_whiteout(newdentry)))
855 new->d_name.len); 876 goto out_dput;
856 err = PTR_ERR(newdentry);
857 if (IS_ERR(newdentry))
858 goto out_unlock;
859 } 877 }
860 878
861 err = -ESTALE;
862 if (olddentry->d_parent != old_upperdir)
863 goto out_dput;
864 if (newdentry->d_parent != new_upperdir)
865 goto out_dput;
866 if (olddentry == trap) 879 if (olddentry == trap)
867 goto out_dput; 880 goto out_dput;
868 if (newdentry == trap) 881 if (newdentry == trap)
@@ -925,6 +938,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
925 938
926out_dput: 939out_dput:
927 dput(newdentry); 940 dput(newdentry);
941out_dput_old:
942 dput(olddentry);
928out_unlock: 943out_unlock:
929 unlock_rename(new_upperdir, old_upperdir); 944 unlock_rename(new_upperdir, old_upperdir);
930out_revert_creds: 945out_revert_creds:
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 99b4168c36ff..6a7090f4a441 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -166,6 +166,7 @@ extern const struct file_operations ovl_dir_operations;
166int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); 166int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
167void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); 167void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
168void ovl_cache_free(struct list_head *list); 168void ovl_cache_free(struct list_head *list);
169int ovl_check_d_type_supported(struct path *realpath);
169 170
170/* inode.c */ 171/* inode.c */
171int ovl_setattr(struct dentry *dentry, struct iattr *attr); 172int ovl_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index fdaf28f75e12..6ec1e43a9a54 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,13 +36,14 @@ struct ovl_dir_cache {
36 36
37struct ovl_readdir_data { 37struct ovl_readdir_data {
38 struct dir_context ctx; 38 struct dir_context ctx;
39 bool is_merge; 39 bool is_lowest;
40 struct rb_root root; 40 struct rb_root root;
41 struct list_head *list; 41 struct list_head *list;
42 struct list_head middle; 42 struct list_head middle;
43 struct ovl_cache_entry *first_maybe_whiteout; 43 struct ovl_cache_entry *first_maybe_whiteout;
44 int count; 44 int count;
45 int err; 45 int err;
46 bool d_type_supported;
46}; 47};
47 48
48struct ovl_dir_file { 49struct ovl_dir_file {
@@ -139,9 +140,9 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
139 return 0; 140 return 0;
140} 141}
141 142
142static int ovl_fill_lower(struct ovl_readdir_data *rdd, 143static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
143 const char *name, int namelen, 144 const char *name, int namelen,
144 loff_t offset, u64 ino, unsigned int d_type) 145 loff_t offset, u64 ino, unsigned int d_type)
145{ 146{
146 struct ovl_cache_entry *p; 147 struct ovl_cache_entry *p;
147 148
@@ -193,10 +194,10 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
193 container_of(ctx, struct ovl_readdir_data, ctx); 194 container_of(ctx, struct ovl_readdir_data, ctx);
194 195
195 rdd->count++; 196 rdd->count++;
196 if (!rdd->is_merge) 197 if (!rdd->is_lowest)
197 return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); 198 return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
198 else 199 else
199 return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); 200 return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
200} 201}
201 202
202static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) 203static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
@@ -289,7 +290,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
289 .ctx.actor = ovl_fill_merge, 290 .ctx.actor = ovl_fill_merge,
290 .list = list, 291 .list = list,
291 .root = RB_ROOT, 292 .root = RB_ROOT,
292 .is_merge = false, 293 .is_lowest = false,
293 }; 294 };
294 int idx, next; 295 int idx, next;
295 296
@@ -306,7 +307,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
306 * allows offsets to be reasonably constant 307 * allows offsets to be reasonably constant
307 */ 308 */
308 list_add(&rdd.middle, rdd.list); 309 list_add(&rdd.middle, rdd.list);
309 rdd.is_merge = true; 310 rdd.is_lowest = true;
310 err = ovl_dir_read(&realpath, &rdd); 311 err = ovl_dir_read(&realpath, &rdd);
311 list_del(&rdd.middle); 312 list_del(&rdd.middle);
312 } 313 }
@@ -577,3 +578,39 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
577 } 578 }
578 inode_unlock(upper->d_inode); 579 inode_unlock(upper->d_inode);
579} 580}
581
582static int ovl_check_d_type(struct dir_context *ctx, const char *name,
583 int namelen, loff_t offset, u64 ino,
584 unsigned int d_type)
585{
586 struct ovl_readdir_data *rdd =
587 container_of(ctx, struct ovl_readdir_data, ctx);
588
589 /* Even if d_type is not supported, DT_DIR is returned for . and .. */
590 if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
591 return 0;
592
593 if (d_type != DT_UNKNOWN)
594 rdd->d_type_supported = true;
595
596 return 0;
597}
598
599/*
600 * Returns 1 if d_type is supported, 0 if not supported/unknown. Negative
601 * values if an error is encountered.
602 */
603int ovl_check_d_type_supported(struct path *realpath)
604{
605 int err;
606 struct ovl_readdir_data rdd = {
607 .ctx.actor = ovl_check_d_type,
608 .d_type_supported = false,
609 };
610
611 err = ovl_dir_read(realpath, &rdd);
612 if (err)
613 return err;
614
615 return rdd.d_type_supported;
616}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 619ad4b016d2..ef64984c9bbc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -936,7 +936,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
936 936
937 err = -EINVAL; 937 err = -EINVAL;
938 if (!ufs->config.lowerdir) { 938 if (!ufs->config.lowerdir) {
939 pr_err("overlayfs: missing 'lowerdir'\n"); 939 if (!silent)
940 pr_err("overlayfs: missing 'lowerdir'\n");
940 goto out_free_config; 941 goto out_free_config;
941 } 942 }
942 943
@@ -1028,6 +1029,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1028 sb->s_flags |= MS_RDONLY; 1029 sb->s_flags |= MS_RDONLY;
1029 ufs->workdir = NULL; 1030 ufs->workdir = NULL;
1030 } 1031 }
1032
1033 /*
1034 * Upper should support d_type, else whiteouts are visible.
1035 * Given workdir and upper are on same fs, we can do
1036 * Given that workdir and upper are on the same fs, we can do
1037 */
1038 err = ovl_check_d_type_supported(&workpath);
1039 if (err < 0)
1040 goto out_put_workdir;
1041
1042 if (!err) {
1043 pr_err("overlayfs: upper fs needs to support d_type.\n");
1044 err = -EINVAL;
1045 goto out_put_workdir;
1046 }
1031 } 1047 }
1032 1048
1033 err = -ENOMEM; 1049 err = -ENOMEM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4f764c2ac1a5..b1755b23893e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
434 && !lookup_symbol_name(wchan, symname)) 434 && !lookup_symbol_name(wchan, symname))
435 seq_printf(m, "%s", symname); 435 seq_printf(m, "%s", symname);
436 else 436 else
437 seq_putc(m, '0'); 437 seq_puts(m, "0\n");
438 438
439 return 0; 439 return 0;
440} 440}
@@ -2158,6 +2158,7 @@ static const struct file_operations proc_map_files_operations = {
2158 .llseek = default_llseek, 2158 .llseek = default_llseek,
2159}; 2159};
2160 2160
2161#ifdef CONFIG_CHECKPOINT_RESTORE
2161struct timers_private { 2162struct timers_private {
2162 struct pid *pid; 2163 struct pid *pid;
2163 struct task_struct *task; 2164 struct task_struct *task;
@@ -2256,6 +2257,73 @@ static const struct file_operations proc_timers_operations = {
2256 .llseek = seq_lseek, 2257 .llseek = seq_lseek,
2257 .release = seq_release_private, 2258 .release = seq_release_private,
2258}; 2259};
2260#endif
2261
2262static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
2263 size_t count, loff_t *offset)
2264{
2265 struct inode *inode = file_inode(file);
2266 struct task_struct *p;
2267 u64 slack_ns;
2268 int err;
2269
2270 err = kstrtoull_from_user(buf, count, 10, &slack_ns);
2271 if (err < 0)
2272 return err;
2273
2274 p = get_proc_task(inode);
2275 if (!p)
2276 return -ESRCH;
2277
2278 if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
2279 task_lock(p);
2280 if (slack_ns == 0)
2281 p->timer_slack_ns = p->default_timer_slack_ns;
2282 else
2283 p->timer_slack_ns = slack_ns;
2284 task_unlock(p);
2285 } else
2286 count = -EPERM;
2287
2288 put_task_struct(p);
2289
2290 return count;
2291}
2292
2293static int timerslack_ns_show(struct seq_file *m, void *v)
2294{
2295 struct inode *inode = m->private;
2296 struct task_struct *p;
2297 int err = 0;
2298
2299 p = get_proc_task(inode);
2300 if (!p)
2301 return -ESRCH;
2302
2303 if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
2304 task_lock(p);
2305 seq_printf(m, "%llu\n", p->timer_slack_ns);
2306 task_unlock(p);
2307 } else
2308 err = -EPERM;
2309
2310 put_task_struct(p);
2311
2312 return err;
2313}
2314
2315static int timerslack_ns_open(struct inode *inode, struct file *filp)
2316{
2317 return single_open(filp, timerslack_ns_show, inode);
2318}
2319
2320static const struct file_operations proc_pid_set_timerslack_ns_operations = {
2321 .open = timerslack_ns_open,
2322 .read = seq_read,
2323 .write = timerslack_ns_write,
2324 .llseek = seq_lseek,
2325 .release = single_release,
2326};
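
A userspace sketch of the new file (path per the REG() entry added below; per timerslack_ns_write() above, writing 0 restores the task's default slack). Illustration only, not patch code:

	#include <stdio.h>
	#include <sys/types.h>

	static int set_timerslack(pid_t pid, unsigned long long ns)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%d/timerslack_ns", (int)pid);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%llu\n", ns);	/* 0 resets to the default */
		return fclose(f);
	}

For example, set_timerslack(getpid(), 50000) asks for 50us of slack (getpid() needs <unistd.h>).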
2259 2327
2260static int proc_pident_instantiate(struct inode *dir, 2328static int proc_pident_instantiate(struct inode *dir,
2261 struct dentry *dentry, struct task_struct *task, const void *ptr) 2329 struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2831,6 +2899,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2831#ifdef CONFIG_CHECKPOINT_RESTORE 2899#ifdef CONFIG_CHECKPOINT_RESTORE
2832 REG("timers", S_IRUGO, proc_timers_operations), 2900 REG("timers", S_IRUGO, proc_timers_operations),
2833#endif 2901#endif
2902 REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
2834}; 2903};
2835 2904
2836static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) 2905static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index df4661abadc4..83720460c5bc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
29 unsigned long committed; 29 unsigned long committed;
30 long cached; 30 long cached;
31 long available; 31 long available;
32 unsigned long pagecache;
33 unsigned long wmark_low = 0;
34 unsigned long pages[NR_LRU_LISTS]; 32 unsigned long pages[NR_LRU_LISTS];
35 struct zone *zone;
36 int lru; 33 int lru;
37 34
38/* 35/*
@@ -51,33 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
51 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 48 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
52 pages[lru] = global_page_state(NR_LRU_BASE + lru); 49 pages[lru] = global_page_state(NR_LRU_BASE + lru);
53 50
54 for_each_zone(zone) 51 available = si_mem_available();
55 wmark_low += zone->watermark[WMARK_LOW];
56
57 /*
58 * Estimate the amount of memory available for userspace allocations,
59 * without causing swapping.
60 */
61 available = i.freeram - totalreserve_pages;
62
63 /*
64 * Not all the page cache can be freed, otherwise the system will
65 * start swapping. Assume at least half of the page cache, or the
66 * low watermark worth of cache, needs to stay.
67 */
68 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
69 pagecache -= min(pagecache / 2, wmark_low);
70 available += pagecache;
71
72 /*
73 * Part of the reclaimable slab consists of items that are in use,
74 * and cannot be freed. Cap this estimate at the low watermark.
75 */
76 available += global_page_state(NR_SLAB_RECLAIMABLE) -
77 min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
78
79 if (available < 0)
80 available = 0;
81 52
82 /* 53 /*
83 * Tagged format, for easy grepping and expansion. 54 * Tagged format, for easy grepping and expansion.
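For reference, the estimate that si_mem_available() now centralizes is the logic deleted above; a condensed sketch, not the helper's verbatim body (global_page_state(NR_FREE_PAGES) stands in for the old i.freeram):

static long mem_available_estimate(void)
{
	unsigned long wmark_low = 0, pagecache;
	long available;
	struct zone *zone;

	for_each_zone(zone)
		wmark_low += zone->watermark[WMARK_LOW];

	/* Free pages minus the reserves the kernel keeps for itself. */
	available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;

	/* Assume half the page cache, or the low watermark, must stay. */
	pagecache = global_page_state(NR_ACTIVE_FILE) +
		    global_page_state(NR_INACTIVE_FILE);
	pagecache -= min(pagecache / 2, wmark_low);
	available += pagecache;

	/* Likewise for reclaimable slab, capped at the low watermark. */
	available += global_page_state(NR_SLAB_RECLAIMABLE) -
		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);

	return available < 0 ? 0 : available;
}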
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 276f12431dbf..72cb26f85d58 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
28 &userns_operations, 28 &userns_operations,
29#endif 29#endif
30 &mntns_operations, 30 &mntns_operations,
31#ifdef CONFIG_CGROUPS
32 &cgroupns_operations,
33#endif
31}; 34};
32 35
33static const char *proc_ns_get_link(struct dentry *dentry, 36static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b2855eea5405..712f1b9992cc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
103 * pseudo flags for the well known (anonymous) memory mapped pages 103 * pseudo flags for the well known (anonymous) memory mapped pages
104 * 104 *
105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the 105 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
106 * simple test in page_mapcount() is not enough. 106 * simple test in page_mapped() is not enough.
107 */ 107 */
108 if (!PageSlab(page) && page_mapcount(page)) 108 if (!PageSlab(page) && page_mapped(page))
109 u |= 1 << KPF_MMAP; 109 u |= 1 << KPF_MMAP;
110 if (PageAnon(page)) 110 if (PageAnon(page))
111 u |= 1 << KPF_ANON; 111 u |= 1 << KPF_ANON;
@@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page)
148 */ 148 */
149 if (PageBuddy(page)) 149 if (PageBuddy(page))
150 u |= 1 << KPF_BUDDY; 150 u |= 1 << KPF_BUDDY;
151 else if (page_count(page) == 0 && is_free_buddy_page(page))
152 u |= 1 << KPF_BUDDY;
151 153
152 if (PageBalloon(page)) 154 if (PageBalloon(page))
153 u |= 1 << KPF_BALLOON; 155 u |= 1 << KPF_BALLOON;
@@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page)
158 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 160 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
159 161
160 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 162 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
163 if (PageTail(page) && PageSlab(compound_head(page)))
164 u |= 1 << KPF_SLAB;
161 165
162 u |= kpf_copy_bit(k, KPF_ERROR, PG_error); 166 u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
163 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); 167 u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
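The net effect of the two hunks above is that pages inside a free buddy block other than its head, and slab tail pages, now report KPF_BUDDY and KPF_SLAB. A small userspace probe to confirm it (our own; bit numbers per include/uapi/linux/kernel-page-flags.h):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define KPF_SLAB  7
#define KPF_BUDDY 10

int main(int argc, char **argv)
{
	uint64_t pfn, flags;
	int fd;

	if (argc != 2)
		return 1;
	pfn = strtoull(argv[1], NULL, 0);
	fd = open("/proc/kpageflags", O_RDONLY);
	/* Each PFN has one 8-byte flags word at offset pfn * 8. */
	if (fd < 0 || pread(fd, &flags, 8, pfn * 8) != 8)
		return 1;
	printf("pfn %llu: buddy=%d slab=%d\n", (unsigned long long)pfn,
	       !!(flags & (1ULL << KPF_BUDDY)), !!(flags & (1ULL << KPF_SLAB)));
	close(fd);
	return 0;
}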
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fa95ab2d3674..9df431642042 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -660,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
660 [ilog2(VM_MERGEABLE)] = "mg", 660 [ilog2(VM_MERGEABLE)] = "mg",
661 [ilog2(VM_UFFD_MISSING)]= "um", 661 [ilog2(VM_UFFD_MISSING)]= "um",
662 [ilog2(VM_UFFD_WP)] = "uw", 662 [ilog2(VM_UFFD_WP)] = "uw",
663#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
664 /* These come out via ProtectionKey: */
665 [ilog2(VM_PKEY_BIT0)] = "",
666 [ilog2(VM_PKEY_BIT1)] = "",
667 [ilog2(VM_PKEY_BIT2)] = "",
668 [ilog2(VM_PKEY_BIT3)] = "",
669#endif
663 }; 670 };
664 size_t i; 671 size_t i;
665 672
666 seq_puts(m, "VmFlags: "); 673 seq_puts(m, "VmFlags: ");
667 for (i = 0; i < BITS_PER_LONG; i++) { 674 for (i = 0; i < BITS_PER_LONG; i++) {
675 if (!mnemonics[i][0])
676 continue;
668 if (vma->vm_flags & (1UL << i)) { 677 if (vma->vm_flags & (1UL << i)) {
669 seq_printf(m, "%c%c ", 678 seq_printf(m, "%c%c ",
670 mnemonics[i][0], mnemonics[i][1]); 679 mnemonics[i][0], mnemonics[i][1]);
@@ -702,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
702} 711}
703#endif /* HUGETLB_PAGE */ 712#endif /* HUGETLB_PAGE */
704 713
714void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
715{
716}
717
705static int show_smap(struct seq_file *m, void *v, int is_pid) 718static int show_smap(struct seq_file *m, void *v, int is_pid)
706{ 719{
707 struct vm_area_struct *vma = v; 720 struct vm_area_struct *vma = v;
@@ -783,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
783 (vma->vm_flags & VM_LOCKED) ? 796 (vma->vm_flags & VM_LOCKED) ?
784 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 797 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
785 798
799 arch_show_smap(m, vma);
786 show_smap_vma_flags(m, vma); 800 show_smap_vma_flags(m, vma);
787 m_cache_vma(m, vma); 801 m_cache_vma(m, vma);
788 return 0; 802 return 0;
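Because arch_show_smap() is declared __weak, the empty definition above is only a fallback; an architecture can supply a strong version to append its own lines. A hypothetical override in the direction the VM_PKEY_BIT* masking hints at (vma_pkey() is an assumed arch helper, not part of this hunk):

void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
	/* Emit the protection key behind the masked VM_PKEY_BIT* flags. */
	seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
}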
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4e61388ec03d..55bb57e6a30d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -231,7 +231,9 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
231 231
232 list_for_each_entry(m, &vmcore_list, list) { 232 list_for_each_entry(m, &vmcore_list, list) {
233 if (*fpos < m->offset + m->size) { 233 if (*fpos < m->offset + m->size) {
234 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); 234 tsz = (size_t)min_t(unsigned long long,
235 m->offset + m->size - *fpos,
236 buflen);
235 start = m->paddr + *fpos - m->offset; 237 start = m->paddr + *fpos - m->offset;
236 tmp = read_from_oldmem(buffer, tsz, &start, userbuf); 238 tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
237 if (tmp < 0) 239 if (tmp < 0)
@@ -461,7 +463,8 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
461 if (start < m->offset + m->size) { 463 if (start < m->offset + m->size) {
462 u64 paddr = 0; 464 u64 paddr = 0;
463 465
464 tsz = min_t(size_t, m->offset + m->size - start, size); 466 tsz = (size_t)min_t(unsigned long long,
467 m->offset + m->size - start, size);
465 paddr = m->paddr + start - m->offset; 468 paddr = m->paddr + start - m->offset;
466 if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, 469 if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
467 paddr >> PAGE_SHIFT, tsz, 470 paddr >> PAGE_SHIFT, tsz,
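The cast reshuffle matters on 32-bit kernels, where size_t is 32 bits wide: min_t(size_t, ...) truncates the 64-bit distance before comparing, whereas the new form compares in 64 bits and only then narrows a result already bounded by buflen or size. A fragment illustrating the failure mode:

u64 remaining = 0x100000000ULL;	/* 4 GiB left in this vmcore chunk */
size_t buflen = 4096;

/* Old: 0x100000000 truncates to 0 on 32-bit, so tsz ends up 0. */
size_t bad = min_t(size_t, remaining, buflen);

/* New: compare as 64-bit first; the winner (4096) always fits size_t. */
size_t good = (size_t)min_t(unsigned long long, remaining, buflen);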
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 2256e7e23e67..3f1190d18991 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -199,6 +199,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
199 if (sb->s_op->show_devname) { 199 if (sb->s_op->show_devname) {
200 seq_puts(m, "device "); 200 seq_puts(m, "device ");
201 err = sb->s_op->show_devname(m, mnt_path.dentry); 201 err = sb->s_op->show_devname(m, mnt_path.dentry);
202 if (err)
203 goto out;
202 } else { 204 } else {
203 if (r->mnt_devname) { 205 if (r->mnt_devname) {
204 seq_puts(m, "device "); 206 seq_puts(m, "device ");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 319c3a60cfa5..bd9812e83461 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -55,8 +55,8 @@ static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); 55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
56MODULE_PARM_DESC(pmsg_size, "size of user space message log"); 56MODULE_PARM_DESC(pmsg_size, "size of user space message log");
57 57
58static ulong mem_address; 58static unsigned long long mem_address;
59module_param(mem_address, ulong, 0400); 59module_param(mem_address, ullong, 0400);
60MODULE_PARM_DESC(mem_address, 60MODULE_PARM_DESC(mem_address,
61 "start of reserved RAM used to store oops/panic logs"); 61 "start of reserved RAM used to store oops/panic logs");
62 62
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3c3b81bb6dfe..ba827daea5a0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -411,6 +411,8 @@ int dquot_acquire(struct dquot *dquot)
411 ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); 411 ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
412 if (ret < 0) 412 if (ret < 0)
413 goto out_iolock; 413 goto out_iolock;
414 /* Make sure flags update is visible after dquot has been filled */
415 smp_mb__before_atomic();
414 set_bit(DQ_READ_B, &dquot->dq_flags); 416 set_bit(DQ_READ_B, &dquot->dq_flags);
415 /* Instantiate dquot if needed */ 417 /* Instantiate dquot if needed */
416 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { 418 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
@@ -427,6 +429,11 @@ int dquot_acquire(struct dquot *dquot)
427 goto out_iolock; 429 goto out_iolock;
428 } 430 }
429 } 431 }
432 /*
433 * Make sure flags update is visible after on-disk struct has been
434 * allocated. Paired with smp_rmb() in dqget().
435 */
436 smp_mb__before_atomic();
430 set_bit(DQ_ACTIVE_B, &dquot->dq_flags); 437 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
431out_iolock: 438out_iolock:
432 mutex_unlock(&dqopt->dqio_mutex); 439 mutex_unlock(&dqopt->dqio_mutex);
@@ -887,6 +894,11 @@ we_slept:
887 goto out; 894 goto out;
888 } 895 }
889 } 896 }
897 /*
898 * Make sure following reads see filled structure - paired with
899 * smp_mb__before_atomic() in dquot_acquire().
900 */
901 smp_rmb();
890#ifdef CONFIG_QUOTA_DEBUG 902#ifdef CONFIG_QUOTA_DEBUG
891 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 903 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
892#endif 904#endif
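The barriers added in the two dquot_acquire() hunks and the smp_rmb() here form one publish/consume pattern: the writer makes the filled dquot globally visible before setting the bit, and the reader may trust the contents only after the bit test plus read barrier. Schematically (fill_dquot() and use_dquot() are placeholders, not kernel functions):

/* Writer, as in dquot_acquire(): */
fill_dquot(dquot);		/* read the on-disk data into the struct */
smp_mb__before_atomic();	/* order the fill before the flag update */
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);

/* Reader, as in dqget(): */
if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
	smp_rmb();		/* pairs with the writer's barrier */
	use_dquot(dquot);	/* contents guaranteed visible here */
}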
@@ -1398,7 +1410,7 @@ static int dquot_active(const struct inode *inode)
1398static int __dquot_initialize(struct inode *inode, int type) 1410static int __dquot_initialize(struct inode *inode, int type)
1399{ 1411{
1400 int cnt, init_needed = 0; 1412 int cnt, init_needed = 0;
1401 struct dquot **dquots, *got[MAXQUOTAS]; 1413 struct dquot **dquots, *got[MAXQUOTAS] = {};
1402 struct super_block *sb = inode->i_sb; 1414 struct super_block *sb = inode->i_sb;
1403 qsize_t rsv; 1415 qsize_t rsv;
1404 int ret = 0; 1416 int ret = 0;
@@ -1415,7 +1427,6 @@ static int __dquot_initialize(struct inode *inode, int type)
1415 int rc; 1427 int rc;
1416 struct dquot *dquot; 1428 struct dquot *dquot;
1417 1429
1418 got[cnt] = NULL;
1419 if (type != -1 && cnt != type) 1430 if (type != -1 && cnt != type)
1420 continue; 1431 continue;
1421 /* 1432 /*
@@ -2031,6 +2042,21 @@ int dquot_commit_info(struct super_block *sb, int type)
2031} 2042}
2032EXPORT_SYMBOL(dquot_commit_info); 2043EXPORT_SYMBOL(dquot_commit_info);
2033 2044
2045int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
2046{
2047 struct quota_info *dqopt = sb_dqopt(sb);
2048 int err;
2049
2050 if (!dqopt->ops[qid->type]->get_next_id)
2051 return -ENOSYS;
2052 mutex_lock(&dqopt->dqio_mutex);
2053 err = dqopt->ops[qid->type]->get_next_id(sb, qid);
2054 mutex_unlock(&dqopt->dqio_mutex);
2055
2056 return err;
2057}
2058EXPORT_SYMBOL(dquot_get_next_id);
2059
2034/* 2060/*
2035 * Definitions of diskquota operations. 2061 * Definitions of diskquota operations.
2036 */ 2062 */
@@ -2042,6 +2068,7 @@ const struct dquot_operations dquot_operations = {
2042 .write_info = dquot_commit_info, 2068 .write_info = dquot_commit_info,
2043 .alloc_dquot = dquot_alloc, 2069 .alloc_dquot = dquot_alloc,
2044 .destroy_dquot = dquot_destroy, 2070 .destroy_dquot = dquot_destroy,
2071 .get_next_id = dquot_get_next_id,
2045}; 2072};
2046EXPORT_SYMBOL(dquot_operations); 2073EXPORT_SYMBOL(dquot_operations);
2047 2074
@@ -2430,9 +2457,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2430 struct dentry *dentry; 2457 struct dentry *dentry;
2431 int error; 2458 int error;
2432 2459
2433 inode_lock(d_inode(sb->s_root)); 2460 dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
2434 dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
2435 inode_unlock(d_inode(sb->s_root));
2436 if (IS_ERR(dentry)) 2461 if (IS_ERR(dentry))
2437 return PTR_ERR(dentry); 2462 return PTR_ERR(dentry);
2438 2463
@@ -2565,6 +2590,27 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2565} 2590}
2566EXPORT_SYMBOL(dquot_get_dqblk); 2591EXPORT_SYMBOL(dquot_get_dqblk);
2567 2592
2593int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
2594 struct qc_dqblk *di)
2595{
2596 struct dquot *dquot;
2597 int err;
2598
2599 if (!sb->dq_op->get_next_id)
2600 return -ENOSYS;
2601 err = sb->dq_op->get_next_id(sb, qid);
2602 if (err < 0)
2603 return err;
2604 dquot = dqget(sb, *qid);
2605 if (IS_ERR(dquot))
2606 return PTR_ERR(dquot);
2607 do_get_dqblk(dquot, di);
2608 dqput(dquot);
2609
2610 return 0;
2611}
2612EXPORT_SYMBOL(dquot_get_next_dqblk);
2613
2568#define VFS_QC_MASK \ 2614#define VFS_QC_MASK \
2569 (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ 2615 (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
2570 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ 2616 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
@@ -2765,6 +2811,7 @@ const struct quotactl_ops dquot_quotactl_ops = {
2765 .get_state = dquot_get_state, 2811 .get_state = dquot_get_state,
2766 .set_info = dquot_set_dqinfo, 2812 .set_info = dquot_set_dqinfo,
2767 .get_dqblk = dquot_get_dqblk, 2813 .get_dqblk = dquot_get_dqblk,
2814 .get_nextdqblk = dquot_get_next_dqblk,
2768 .set_dqblk = dquot_set_dqblk 2815 .set_dqblk = dquot_set_dqblk
2769}; 2816};
2770EXPORT_SYMBOL(dquot_quotactl_ops); 2817EXPORT_SYMBOL(dquot_quotactl_ops);
@@ -2776,6 +2823,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = {
2776 .get_state = dquot_get_state, 2823 .get_state = dquot_get_state,
2777 .set_info = dquot_set_dqinfo, 2824 .set_info = dquot_set_dqinfo,
2778 .get_dqblk = dquot_get_dqblk, 2825 .get_dqblk = dquot_get_dqblk,
2826 .get_nextdqblk = dquot_get_next_dqblk,
2779 .set_dqblk = dquot_set_dqblk 2827 .set_dqblk = dquot_set_dqblk
2780}; 2828};
2781EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); 2829EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 3746367098fd..0f10ee9892ce 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,7 +79,7 @@ unsigned int qtype_enforce_flag(int type)
79 return 0; 79 return 0;
80} 80}
81 81
82static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 82static int quota_quotaon(struct super_block *sb, int type, qid_t id,
83 struct path *path) 83 struct path *path)
84{ 84{
85 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) 85 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
@@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
222 return 0; 222 return 0;
223} 223}
224 224
225/*
226 * Return quota for next active quota >= this id, if any exists,
 227 * otherwise return -ENOENT via ->get_nextdqblk.

228 */
229static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
230 void __user *addr)
231{
232 struct kqid qid;
233 struct qc_dqblk fdq;
234 struct if_nextdqblk idq;
235 int ret;
236
237 if (!sb->s_qcop->get_nextdqblk)
238 return -ENOSYS;
239 qid = make_kqid(current_user_ns(), type, id);
240 if (!qid_valid(qid))
241 return -EINVAL;
242 ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
243 if (ret)
244 return ret;
245 /* struct if_nextdqblk is a superset of struct if_dqblk */
246 copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq);
247 idq.dqb_id = from_kqid(current_user_ns(), qid);
248 if (copy_to_user(addr, &idq, sizeof(idq)))
249 return -EFAULT;
250 return 0;
251}
252
225static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) 253static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
226{ 254{
227 dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); 255 dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
@@ -625,6 +653,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
625 return ret; 653 return ret;
626} 654}
627 655
656/*
657 * Return quota for next active quota >= this id, if any exists,
658 * otherwise return -ENOENT via ->get_nextdqblk.
659 */
660static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
661 void __user *addr)
662{
663 struct fs_disk_quota fdq;
664 struct qc_dqblk qdq;
665 struct kqid qid;
666 qid_t id_out;
667 int ret;
668
669 if (!sb->s_qcop->get_nextdqblk)
670 return -ENOSYS;
671 qid = make_kqid(current_user_ns(), type, id);
672 if (!qid_valid(qid))
673 return -EINVAL;
674 ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
675 if (ret)
676 return ret;
677 id_out = from_kqid(current_user_ns(), qid);
678 copy_to_xfs_dqblk(&fdq, &qdq, type, id_out);
679 if (copy_to_user(addr, &fdq, sizeof(fdq)))
680 return -EFAULT;
681 return ret;
682}
683
628static int quota_rmxquota(struct super_block *sb, void __user *addr) 684static int quota_rmxquota(struct super_block *sb, void __user *addr)
629{ 685{
630 __u32 flags; 686 __u32 flags;
@@ -659,7 +715,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
659 715
660 switch (cmd) { 716 switch (cmd) {
661 case Q_QUOTAON: 717 case Q_QUOTAON:
662 return quota_quotaon(sb, type, cmd, id, path); 718 return quota_quotaon(sb, type, id, path);
663 case Q_QUOTAOFF: 719 case Q_QUOTAOFF:
664 return quota_quotaoff(sb, type); 720 return quota_quotaoff(sb, type);
665 case Q_GETFMT: 721 case Q_GETFMT:
@@ -670,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
670 return quota_setinfo(sb, type, addr); 726 return quota_setinfo(sb, type, addr);
671 case Q_GETQUOTA: 727 case Q_GETQUOTA:
672 return quota_getquota(sb, type, id, addr); 728 return quota_getquota(sb, type, id, addr);
729 case Q_GETNEXTQUOTA:
730 return quota_getnextquota(sb, type, id, addr);
673 case Q_SETQUOTA: 731 case Q_SETQUOTA:
674 return quota_setquota(sb, type, id, addr); 732 return quota_setquota(sb, type, id, addr);
675 case Q_SYNC: 733 case Q_SYNC:
@@ -690,6 +748,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
690 return quota_setxquota(sb, type, id, addr); 748 return quota_setxquota(sb, type, id, addr);
691 case Q_XGETQUOTA: 749 case Q_XGETQUOTA:
692 return quota_getxquota(sb, type, id, addr); 750 return quota_getxquota(sb, type, id, addr);
751 case Q_XGETNEXTQUOTA:
752 return quota_getnextxquota(sb, type, id, addr);
693 case Q_XQUOTASYNC: 753 case Q_XQUOTASYNC:
694 if (sb->s_flags & MS_RDONLY) 754 if (sb->s_flags & MS_RDONLY)
695 return -EROFS; 755 return -EROFS;
@@ -705,6 +765,11 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
705/* Return 1 if 'cmd' will block on frozen filesystem */ 765/* Return 1 if 'cmd' will block on frozen filesystem */
706static int quotactl_cmd_write(int cmd) 766static int quotactl_cmd_write(int cmd)
707{ 767{
768 /*
769 * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access
770 * as dquot_acquire() may allocate space for new structure and OCFS2
771 * needs to increment on-disk use count.
772 */
708 switch (cmd) { 773 switch (cmd) {
709 case Q_GETFMT: 774 case Q_GETFMT:
710 case Q_GETINFO: 775 case Q_GETINFO:
@@ -712,6 +777,7 @@ static int quotactl_cmd_write(int cmd)
712 case Q_XGETQSTAT: 777 case Q_XGETQSTAT:
713 case Q_XGETQSTATV: 778 case Q_XGETQSTATV:
714 case Q_XGETQUOTA: 779 case Q_XGETQUOTA:
780 case Q_XGETNEXTQUOTA:
715 case Q_XQUOTASYNC: 781 case Q_XQUOTASYNC:
716 return 0; 782 return 0;
717 } 783 }
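From userspace the new command lets a tool walk every set quota without probing each id in turn. A hedged sketch (the 0x800009 command value and the struct layout are assumptions based on the "superset of struct if_dqblk" remark in the hunk and the uapi additions of this series):

#include <stdint.h>
#include <stdio.h>
#include <sys/quota.h>

#ifndef Q_GETNEXTQUOTA
#define Q_GETNEXTQUOTA 0x800009		/* assumed uapi value */
#endif

struct if_nextdqblk {			/* if_dqblk plus a trailing id */
	uint64_t dqb_bhardlimit, dqb_bsoftlimit, dqb_curspace;
	uint64_t dqb_ihardlimit, dqb_isoftlimit, dqb_curinodes;
	uint64_t dqb_btime, dqb_itime;
	uint32_t dqb_valid, dqb_id;
};

static void dump_user_quotas(const char *dev)
{
	struct if_nextdqblk q;
	unsigned int id = 0;

	/* Each call returns the first set quota with id >= the one passed
	 * in; the walk ends when the filesystem reports -ENOENT. */
	while (quotactl(QCMD(Q_GETNEXTQUOTA, USRQUOTA), dev, id, (void *)&q) == 0) {
		printf("uid %u: %llu bytes\n", q.dqb_id,
		       (unsigned long long)q.dqb_curspace);
		id = q.dqb_id + 1;
	}
}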
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 58efb83dec1c..0738972e8d3f 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,10 +22,9 @@ MODULE_LICENSE("GPL");
22 22
23#define __QUOTA_QT_PARANOIA 23#define __QUOTA_QT_PARANOIA
24 24
25static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) 25static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
26{ 26{
27 unsigned int epb = info->dqi_usable_bs >> 2; 27 unsigned int epb = info->dqi_usable_bs >> 2;
28 qid_t id = from_kqid(&init_user_ns, qid);
29 28
30 depth = info->dqi_qtree_depth - depth - 1; 29 depth = info->dqi_qtree_depth - depth - 1;
31 while (depth--) 30 while (depth--)
@@ -33,6 +32,13 @@ static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
33 return id % epb; 32 return id % epb;
34} 33}
35 34
35static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
36{
37 qid_t id = from_kqid(&init_user_ns, qid);
38
39 return __get_index(info, id, depth);
40}
41
 36/* Number of entries in one block */ 42/* Number of entries in one block */
37static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) 43static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
38{ 44{
@@ -668,3 +674,60 @@ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
668 return 0; 674 return 0;
669} 675}
670EXPORT_SYMBOL(qtree_release_dquot); 676EXPORT_SYMBOL(qtree_release_dquot);
677
678static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
679 unsigned int blk, int depth)
680{
681 char *buf = getdqbuf(info->dqi_usable_bs);
682 __le32 *ref = (__le32 *)buf;
683 ssize_t ret;
684 unsigned int epb = info->dqi_usable_bs >> 2;
685 unsigned int level_inc = 1;
686 int i;
687
688 if (!buf)
689 return -ENOMEM;
690
691 for (i = depth; i < info->dqi_qtree_depth - 1; i++)
692 level_inc *= epb;
693
694 ret = read_blk(info, blk, buf);
695 if (ret < 0) {
696 quota_error(info->dqi_sb,
697 "Can't read quota tree block %u", blk);
698 goto out_buf;
699 }
700 for (i = __get_index(info, *id, depth); i < epb; i++) {
701 if (ref[i] == cpu_to_le32(0)) {
702 *id += level_inc;
703 continue;
704 }
705 if (depth == info->dqi_qtree_depth - 1) {
706 ret = 0;
707 goto out_buf;
708 }
709 ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
710 if (ret != -ENOENT)
711 break;
712 }
713 if (i == epb) {
714 ret = -ENOENT;
715 goto out_buf;
716 }
717out_buf:
718 kfree(buf);
719 return ret;
720}
721
722int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid)
723{
724 qid_t id = from_kqid(&init_user_ns, *qid);
725 int ret;
726
727 ret = find_next_id(info, &id, QT_TREEOFF, 0);
728 if (ret < 0)
729 return ret;
730 *qid = make_kqid(&init_user_ns, qid->type, id);
731 return 0;
732}
733EXPORT_SYMBOL(qtree_get_next_id);
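The level_inc computed in find_next_id() is what lets a single empty reference skip a whole subtree: a hole at depth d stands for epb^(tree_depth - 1 - d) leaf ids. Worked numbers for the common 1024-byte usable block size:

/* epb = 1024 >> 2 = 256 references per tree block; 4-level tree. */
unsigned int skip_depth0 = 256 * 256 * 256;	/* 16777216 ids per hole */
unsigned int skip_depth2 = 256;			/* 256 ids per hole */
unsigned int skip_leaf   = 1;			/* depth 3: one id */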
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ed85d4f35c04..ca71bf881ad1 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -304,6 +304,11 @@ static int v2_free_file_info(struct super_block *sb, int type)
304 return 0; 304 return 0;
305} 305}
306 306
307static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
308{
309 return qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
310}
311
307static const struct quota_format_ops v2_format_ops = { 312static const struct quota_format_ops v2_format_ops = {
308 .check_quota_file = v2_check_quota_file, 313 .check_quota_file = v2_check_quota_file,
309 .read_file_info = v2_read_file_info, 314 .read_file_info = v2_read_file_info,
@@ -312,6 +317,7 @@ static const struct quota_format_ops v2_format_ops = {
312 .read_dqblk = v2_read_dquot, 317 .read_dqblk = v2_read_dquot,
313 .commit_dqblk = v2_write_dquot, 318 .commit_dqblk = v2_write_dquot,
314 .release_dqblk = v2_release_dquot, 319 .release_dqblk = v2_release_dquot,
320 .get_next_id = v2_get_next_id,
315}; 321};
316 322
317static struct quota_format_type v2r0_quota_format = { 323static struct quota_format_type v2r0_quota_format = {
diff --git a/fs/read_write.c b/fs/read_write.c
index dadf24e5c95b..cf377cf9dfe3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -693,12 +693,17 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
693EXPORT_SYMBOL(iov_shorten); 693EXPORT_SYMBOL(iov_shorten);
694 694
695static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, 695static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
696 loff_t *ppos, iter_fn_t fn) 696 loff_t *ppos, iter_fn_t fn, int flags)
697{ 697{
698 struct kiocb kiocb; 698 struct kiocb kiocb;
699 ssize_t ret; 699 ssize_t ret;
700 700
701 if (flags & ~RWF_HIPRI)
702 return -EOPNOTSUPP;
703
701 init_sync_kiocb(&kiocb, filp); 704 init_sync_kiocb(&kiocb, filp);
705 if (flags & RWF_HIPRI)
706 kiocb.ki_flags |= IOCB_HIPRI;
702 kiocb.ki_pos = *ppos; 707 kiocb.ki_pos = *ppos;
703 708
704 ret = fn(&kiocb, iter); 709 ret = fn(&kiocb, iter);
@@ -709,10 +714,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
709 714
710/* Do it by hand, with file-ops */ 715/* Do it by hand, with file-ops */
711static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, 716static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
712 loff_t *ppos, io_fn_t fn) 717 loff_t *ppos, io_fn_t fn, int flags)
713{ 718{
714 ssize_t ret = 0; 719 ssize_t ret = 0;
715 720
721 if (flags & ~RWF_HIPRI)
722 return -EOPNOTSUPP;
723
716 while (iov_iter_count(iter)) { 724 while (iov_iter_count(iter)) {
717 struct iovec iovec = iov_iter_iovec(iter); 725 struct iovec iovec = iov_iter_iovec(iter);
718 ssize_t nr; 726 ssize_t nr;
@@ -813,7 +821,8 @@ out:
813 821
814static ssize_t do_readv_writev(int type, struct file *file, 822static ssize_t do_readv_writev(int type, struct file *file,
815 const struct iovec __user * uvector, 823 const struct iovec __user * uvector,
816 unsigned long nr_segs, loff_t *pos) 824 unsigned long nr_segs, loff_t *pos,
825 int flags)
817{ 826{
818 size_t tot_len; 827 size_t tot_len;
819 struct iovec iovstack[UIO_FASTIOV]; 828 struct iovec iovstack[UIO_FASTIOV];
@@ -845,9 +854,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
845 } 854 }
846 855
847 if (iter_fn) 856 if (iter_fn)
848 ret = do_iter_readv_writev(file, &iter, pos, iter_fn); 857 ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
849 else 858 else
850 ret = do_loop_readv_writev(file, &iter, pos, fn); 859 ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
851 860
852 if (type != READ) 861 if (type != READ)
853 file_end_write(file); 862 file_end_write(file);
@@ -864,40 +873,40 @@ out:
864} 873}
865 874
866ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 875ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
867 unsigned long vlen, loff_t *pos) 876 unsigned long vlen, loff_t *pos, int flags)
868{ 877{
869 if (!(file->f_mode & FMODE_READ)) 878 if (!(file->f_mode & FMODE_READ))
870 return -EBADF; 879 return -EBADF;
871 if (!(file->f_mode & FMODE_CAN_READ)) 880 if (!(file->f_mode & FMODE_CAN_READ))
872 return -EINVAL; 881 return -EINVAL;
873 882
874 return do_readv_writev(READ, file, vec, vlen, pos); 883 return do_readv_writev(READ, file, vec, vlen, pos, flags);
875} 884}
876 885
877EXPORT_SYMBOL(vfs_readv); 886EXPORT_SYMBOL(vfs_readv);
878 887
879ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 888ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
880 unsigned long vlen, loff_t *pos) 889 unsigned long vlen, loff_t *pos, int flags)
881{ 890{
882 if (!(file->f_mode & FMODE_WRITE)) 891 if (!(file->f_mode & FMODE_WRITE))
883 return -EBADF; 892 return -EBADF;
884 if (!(file->f_mode & FMODE_CAN_WRITE)) 893 if (!(file->f_mode & FMODE_CAN_WRITE))
885 return -EINVAL; 894 return -EINVAL;
886 895
887 return do_readv_writev(WRITE, file, vec, vlen, pos); 896 return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
888} 897}
889 898
890EXPORT_SYMBOL(vfs_writev); 899EXPORT_SYMBOL(vfs_writev);
891 900
892SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 901static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
893 unsigned long, vlen) 902 unsigned long vlen, int flags)
894{ 903{
895 struct fd f = fdget_pos(fd); 904 struct fd f = fdget_pos(fd);
896 ssize_t ret = -EBADF; 905 ssize_t ret = -EBADF;
897 906
898 if (f.file) { 907 if (f.file) {
899 loff_t pos = file_pos_read(f.file); 908 loff_t pos = file_pos_read(f.file);
900 ret = vfs_readv(f.file, vec, vlen, &pos); 909 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
901 if (ret >= 0) 910 if (ret >= 0)
902 file_pos_write(f.file, pos); 911 file_pos_write(f.file, pos);
903 fdput_pos(f); 912 fdput_pos(f);
@@ -909,15 +918,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
909 return ret; 918 return ret;
910} 919}
911 920
912SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 921static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
913 unsigned long, vlen) 922 unsigned long vlen, int flags)
914{ 923{
915 struct fd f = fdget_pos(fd); 924 struct fd f = fdget_pos(fd);
916 ssize_t ret = -EBADF; 925 ssize_t ret = -EBADF;
917 926
918 if (f.file) { 927 if (f.file) {
919 loff_t pos = file_pos_read(f.file); 928 loff_t pos = file_pos_read(f.file);
920 ret = vfs_writev(f.file, vec, vlen, &pos); 929 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
921 if (ret >= 0) 930 if (ret >= 0)
922 file_pos_write(f.file, pos); 931 file_pos_write(f.file, pos);
923 fdput_pos(f); 932 fdput_pos(f);
@@ -935,10 +944,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
935 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 944 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
936} 945}
937 946
938SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 947static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
939 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 948 unsigned long vlen, loff_t pos, int flags)
940{ 949{
941 loff_t pos = pos_from_hilo(pos_h, pos_l);
942 struct fd f; 950 struct fd f;
943 ssize_t ret = -EBADF; 951 ssize_t ret = -EBADF;
944 952
@@ -949,7 +957,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
949 if (f.file) { 957 if (f.file) {
950 ret = -ESPIPE; 958 ret = -ESPIPE;
951 if (f.file->f_mode & FMODE_PREAD) 959 if (f.file->f_mode & FMODE_PREAD)
952 ret = vfs_readv(f.file, vec, vlen, &pos); 960 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
953 fdput(f); 961 fdput(f);
954 } 962 }
955 963
@@ -959,10 +967,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
959 return ret; 967 return ret;
960} 968}
961 969
962SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 970static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
963 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 971 unsigned long vlen, loff_t pos, int flags)
964{ 972{
965 loff_t pos = pos_from_hilo(pos_h, pos_l);
966 struct fd f; 973 struct fd f;
967 ssize_t ret = -EBADF; 974 ssize_t ret = -EBADF;
968 975
@@ -973,7 +980,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
973 if (f.file) { 980 if (f.file) {
974 ret = -ESPIPE; 981 ret = -ESPIPE;
975 if (f.file->f_mode & FMODE_PWRITE) 982 if (f.file->f_mode & FMODE_PWRITE)
976 ret = vfs_writev(f.file, vec, vlen, &pos); 983 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
977 fdput(f); 984 fdput(f);
978 } 985 }
979 986
@@ -983,11 +990,64 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
983 return ret; 990 return ret;
984} 991}
985 992
993SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
994 unsigned long, vlen)
995{
996 return do_readv(fd, vec, vlen, 0);
997}
998
999SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1000 unsigned long, vlen)
1001{
1002 return do_writev(fd, vec, vlen, 0);
1003}
1004
1005SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1006 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1007{
1008 loff_t pos = pos_from_hilo(pos_h, pos_l);
1009
1010 return do_preadv(fd, vec, vlen, pos, 0);
1011}
1012
1013SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1014 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1015 int, flags)
1016{
1017 loff_t pos = pos_from_hilo(pos_h, pos_l);
1018
1019 if (pos == -1)
1020 return do_readv(fd, vec, vlen, flags);
1021
1022 return do_preadv(fd, vec, vlen, pos, flags);
1023}
1024
1025SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1026 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1027{
1028 loff_t pos = pos_from_hilo(pos_h, pos_l);
1029
1030 return do_pwritev(fd, vec, vlen, pos, 0);
1031}
1032
1033SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1034 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1035 int, flags)
1036{
1037 loff_t pos = pos_from_hilo(pos_h, pos_l);
1038
1039 if (pos == -1)
1040 return do_writev(fd, vec, vlen, flags);
1041
1042 return do_pwritev(fd, vec, vlen, pos, flags);
1043}
1044
986#ifdef CONFIG_COMPAT 1045#ifdef CONFIG_COMPAT
987 1046
988static ssize_t compat_do_readv_writev(int type, struct file *file, 1047static ssize_t compat_do_readv_writev(int type, struct file *file,
989 const struct compat_iovec __user *uvector, 1048 const struct compat_iovec __user *uvector,
990 unsigned long nr_segs, loff_t *pos) 1049 unsigned long nr_segs, loff_t *pos,
1050 int flags)
991{ 1051{
992 compat_ssize_t tot_len; 1052 compat_ssize_t tot_len;
993 struct iovec iovstack[UIO_FASTIOV]; 1053 struct iovec iovstack[UIO_FASTIOV];
@@ -1019,9 +1079,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1019 } 1079 }
1020 1080
1021 if (iter_fn) 1081 if (iter_fn)
1022 ret = do_iter_readv_writev(file, &iter, pos, iter_fn); 1082 ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
1023 else 1083 else
1024 ret = do_loop_readv_writev(file, &iter, pos, fn); 1084 ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
1025 1085
1026 if (type != READ) 1086 if (type != READ)
1027 file_end_write(file); 1087 file_end_write(file);
@@ -1039,7 +1099,7 @@ out:
1039 1099
1040static size_t compat_readv(struct file *file, 1100static size_t compat_readv(struct file *file,
1041 const struct compat_iovec __user *vec, 1101 const struct compat_iovec __user *vec,
1042 unsigned long vlen, loff_t *pos) 1102 unsigned long vlen, loff_t *pos, int flags)
1043{ 1103{
1044 ssize_t ret = -EBADF; 1104 ssize_t ret = -EBADF;
1045 1105
@@ -1050,7 +1110,7 @@ static size_t compat_readv(struct file *file,
1050 if (!(file->f_mode & FMODE_CAN_READ)) 1110 if (!(file->f_mode & FMODE_CAN_READ))
1051 goto out; 1111 goto out;
1052 1112
1053 ret = compat_do_readv_writev(READ, file, vec, vlen, pos); 1113 ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
1054 1114
1055out: 1115out:
1056 if (ret > 0) 1116 if (ret > 0)
@@ -1059,9 +1119,9 @@ out:
1059 return ret; 1119 return ret;
1060} 1120}
1061 1121
1062COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, 1122static size_t do_compat_readv(compat_ulong_t fd,
1063 const struct compat_iovec __user *,vec, 1123 const struct compat_iovec __user *vec,
1064 compat_ulong_t, vlen) 1124 compat_ulong_t vlen, int flags)
1065{ 1125{
1066 struct fd f = fdget_pos(fd); 1126 struct fd f = fdget_pos(fd);
1067 ssize_t ret; 1127 ssize_t ret;
@@ -1070,16 +1130,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1070 if (!f.file) 1130 if (!f.file)
1071 return -EBADF; 1131 return -EBADF;
1072 pos = f.file->f_pos; 1132 pos = f.file->f_pos;
1073 ret = compat_readv(f.file, vec, vlen, &pos); 1133 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1074 if (ret >= 0) 1134 if (ret >= 0)
1075 f.file->f_pos = pos; 1135 f.file->f_pos = pos;
1076 fdput_pos(f); 1136 fdput_pos(f);
1077 return ret; 1137 return ret;
1138
1139}
1140
1141COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1142 const struct compat_iovec __user *,vec,
1143 compat_ulong_t, vlen)
1144{
1145 return do_compat_readv(fd, vec, vlen, 0);
1078} 1146}
1079 1147
1080static long __compat_sys_preadv64(unsigned long fd, 1148static long do_compat_preadv64(unsigned long fd,
1081 const struct compat_iovec __user *vec, 1149 const struct compat_iovec __user *vec,
1082 unsigned long vlen, loff_t pos) 1150 unsigned long vlen, loff_t pos, int flags)
1083{ 1151{
1084 struct fd f; 1152 struct fd f;
1085 ssize_t ret; 1153 ssize_t ret;
@@ -1091,7 +1159,7 @@ static long __compat_sys_preadv64(unsigned long fd,
1091 return -EBADF; 1159 return -EBADF;
1092 ret = -ESPIPE; 1160 ret = -ESPIPE;
1093 if (f.file->f_mode & FMODE_PREAD) 1161 if (f.file->f_mode & FMODE_PREAD)
1094 ret = compat_readv(f.file, vec, vlen, &pos); 1162 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1095 fdput(f); 1163 fdput(f);
1096 return ret; 1164 return ret;
1097} 1165}
@@ -1101,7 +1169,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1101 const struct compat_iovec __user *,vec, 1169 const struct compat_iovec __user *,vec,
1102 unsigned long, vlen, loff_t, pos) 1170 unsigned long, vlen, loff_t, pos)
1103{ 1171{
1104 return __compat_sys_preadv64(fd, vec, vlen, pos); 1172 return do_compat_preadv64(fd, vec, vlen, pos, 0);
1105} 1173}
1106#endif 1174#endif
1107 1175
@@ -1111,12 +1179,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1111{ 1179{
1112 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1180 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1113 1181
1114 return __compat_sys_preadv64(fd, vec, vlen, pos); 1182 return do_compat_preadv64(fd, vec, vlen, pos, 0);
1183}
1184
1185COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1186 const struct compat_iovec __user *,vec,
1187 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1188 int, flags)
1189{
1190 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1191
1192 if (pos == -1)
1193 return do_compat_readv(fd, vec, vlen, flags);
1194
1195 return do_compat_preadv64(fd, vec, vlen, pos, flags);
1115} 1196}
1116 1197
1117static size_t compat_writev(struct file *file, 1198static size_t compat_writev(struct file *file,
1118 const struct compat_iovec __user *vec, 1199 const struct compat_iovec __user *vec,
1119 unsigned long vlen, loff_t *pos) 1200 unsigned long vlen, loff_t *pos, int flags)
1120{ 1201{
1121 ssize_t ret = -EBADF; 1202 ssize_t ret = -EBADF;
1122 1203
@@ -1127,7 +1208,7 @@ static size_t compat_writev(struct file *file,
1127 if (!(file->f_mode & FMODE_CAN_WRITE)) 1208 if (!(file->f_mode & FMODE_CAN_WRITE))
1128 goto out; 1209 goto out;
1129 1210
 1130 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); 1211 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
1131 1212
1132out: 1213out:
1133 if (ret > 0) 1214 if (ret > 0)
@@ -1136,9 +1217,9 @@ out:
1136 return ret; 1217 return ret;
1137} 1218}
1138 1219
1139COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, 1220static size_t do_compat_writev(compat_ulong_t fd,
 1140 const struct compat_iovec __user *, vec, 1221 const struct compat_iovec __user *vec,
1141 compat_ulong_t, vlen) 1222 compat_ulong_t vlen, int flags)
1142{ 1223{
1143 struct fd f = fdget_pos(fd); 1224 struct fd f = fdget_pos(fd);
1144 ssize_t ret; 1225 ssize_t ret;
@@ -1147,16 +1228,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1147 if (!f.file) 1228 if (!f.file)
1148 return -EBADF; 1229 return -EBADF;
1149 pos = f.file->f_pos; 1230 pos = f.file->f_pos;
1150 ret = compat_writev(f.file, vec, vlen, &pos); 1231 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1151 if (ret >= 0) 1232 if (ret >= 0)
1152 f.file->f_pos = pos; 1233 f.file->f_pos = pos;
1153 fdput_pos(f); 1234 fdput_pos(f);
1154 return ret; 1235 return ret;
1155} 1236}
1156 1237
1157static long __compat_sys_pwritev64(unsigned long fd, 1238COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1239 const struct compat_iovec __user *, vec,
1240 compat_ulong_t, vlen)
1241{
1242 return do_compat_writev(fd, vec, vlen, 0);
1243}
1244
1245static long do_compat_pwritev64(unsigned long fd,
1158 const struct compat_iovec __user *vec, 1246 const struct compat_iovec __user *vec,
1159 unsigned long vlen, loff_t pos) 1247 unsigned long vlen, loff_t pos, int flags)
1160{ 1248{
1161 struct fd f; 1249 struct fd f;
1162 ssize_t ret; 1250 ssize_t ret;
@@ -1168,7 +1256,7 @@ static long __compat_sys_pwritev64(unsigned long fd,
1168 return -EBADF; 1256 return -EBADF;
1169 ret = -ESPIPE; 1257 ret = -ESPIPE;
1170 if (f.file->f_mode & FMODE_PWRITE) 1258 if (f.file->f_mode & FMODE_PWRITE)
1171 ret = compat_writev(f.file, vec, vlen, &pos); 1259 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1172 fdput(f); 1260 fdput(f);
1173 return ret; 1261 return ret;
1174} 1262}
@@ -1178,7 +1266,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1178 const struct compat_iovec __user *,vec, 1266 const struct compat_iovec __user *,vec,
1179 unsigned long, vlen, loff_t, pos) 1267 unsigned long, vlen, loff_t, pos)
1180{ 1268{
1181 return __compat_sys_pwritev64(fd, vec, vlen, pos); 1269 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1182} 1270}
1183#endif 1271#endif
1184 1272
@@ -1188,8 +1276,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1188{ 1276{
1189 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1277 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1190 1278
1191 return __compat_sys_pwritev64(fd, vec, vlen, pos); 1279 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1280}
1281
1282COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1283 const struct compat_iovec __user *,vec,
1284 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
1285{
1286 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1287
1288 if (pos == -1)
1289 return do_compat_writev(fd, vec, vlen, flags);
1290
1291 return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1192} 1292}
1293
1193#endif 1294#endif
1194 1295
1195static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 1296static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
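Until libc grows wrappers, preadv2/pwritev2 are reachable through syscall(2). A sketch of issuing a polled read with RWF_HIPRI (assumes the architecture has wired up __NR_preadv2 and the headers expose it; the wrapper name is ours). Passing pos == -1 takes the readv-style path above that uses and updates the file position:

#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#define RWF_HIPRI 0x00000001	/* the only flag the hunks accept so far */

static ssize_t my_preadv2(int fd, const struct iovec *iov, int iovcnt,
			  long long pos, int flags)
{
	/* pos is split into low/high words, as pos_from_hilo() expects. */
	return syscall(__NR_preadv2, fd, iov, iovcnt,
		       (unsigned long)pos, (unsigned long)(pos >> 32), flags);
}

A read submitted with flags = RWF_HIPRI asks the block layer to poll for completion on devices that support it, trading CPU time for latency.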
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c0306ec8ed7b..b8f2d1e8c645 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -802,6 +802,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
802 .write_info = reiserfs_write_info, 802 .write_info = reiserfs_write_info,
803 .alloc_dquot = dquot_alloc, 803 .alloc_dquot = dquot_alloc,
804 .destroy_dquot = dquot_destroy, 804 .destroy_dquot = dquot_destroy,
805 .get_next_id = dquot_get_next_id,
805}; 806};
806 807
807static const struct quotactl_ops reiserfs_qctl_operations = { 808static const struct quotactl_ops reiserfs_qctl_operations = {
diff --git a/fs/select.c b/fs/select.c
index 79d0d4953cad..869293988c2a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv)
70 return slack; 70 return slack;
71} 71}
72 72
73long select_estimate_accuracy(struct timespec *tv) 73u64 select_estimate_accuracy(struct timespec *tv)
74{ 74{
75 unsigned long ret; 75 u64 ret;
76 struct timespec now; 76 struct timespec now;
77 77
78 /* 78 /*
@@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
402 struct poll_wqueues table; 402 struct poll_wqueues table;
403 poll_table *wait; 403 poll_table *wait;
404 int retval, i, timed_out = 0; 404 int retval, i, timed_out = 0;
405 unsigned long slack = 0; 405 u64 slack = 0;
406 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; 406 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
407 unsigned long busy_end = 0; 407 unsigned long busy_end = 0;
408 408
@@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
784 poll_table* pt = &wait->pt; 784 poll_table* pt = &wait->pt;
785 ktime_t expire, *to = NULL; 785 ktime_t expire, *to = NULL;
786 int timed_out = 0, count = 0; 786 int timed_out = 0, count = 0;
787 unsigned long slack = 0; 787 u64 slack = 0;
788 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; 788 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
789 unsigned long busy_end = 0; 789 unsigned long busy_end = 0;
790 790
diff --git a/fs/splice.c b/fs/splice.c
index 82bc0d64fc38..9947b5c69664 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
185 unsigned int spd_pages = spd->nr_pages; 185 unsigned int spd_pages = spd->nr_pages;
186 int ret, do_wakeup, page_nr; 186 int ret, do_wakeup, page_nr;
187 187
188 if (!spd_pages)
189 return 0;
190
188 ret = 0; 191 ret = 0;
189 do_wakeup = 0; 192 do_wakeup = 0;
190 page_nr = 0; 193 page_nr = 0;
@@ -577,7 +580,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
577 old_fs = get_fs(); 580 old_fs = get_fs();
578 set_fs(get_ds()); 581 set_fs(get_ds());
579 /* The cast to a user pointer is valid due to the set_fs() */ 582 /* The cast to a user pointer is valid due to the set_fs() */
580 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); 583 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
581 set_fs(old_fs); 584 set_fs(old_fs);
582 585
583 return res; 586 return res;
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 2c6f0cb816b4..c54a24360f85 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -4,3 +4,4 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
4ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o 4ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
5ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o 5ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
6ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o 6ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
7ubifs-y += misc.o
diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c
new file mode 100644
index 000000000000..486a2844949f
--- /dev/null
+++ b/fs/ubifs/misc.c
@@ -0,0 +1,57 @@
1#include <linux/kernel.h>
2#include "ubifs.h"
3
4/* Normal UBIFS messages */
5void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...)
6{
7 struct va_format vaf;
8 va_list args;
9
10 va_start(args, fmt);
11
12 vaf.fmt = fmt;
13 vaf.va = &args;
14
15 pr_notice("UBIFS (ubi%d:%d): %pV\n",
16 c->vi.ubi_num, c->vi.vol_id, &vaf);
17
18 va_end(args);
 19}
20
21/* UBIFS error messages */
22void ubifs_err(const struct ubifs_info *c, const char *fmt, ...)
23{
24 struct va_format vaf;
25 va_list args;
26
27 va_start(args, fmt);
28
29 vaf.fmt = fmt;
30 vaf.va = &args;
31
32 pr_err("UBIFS error (ubi%d:%d pid %d): %ps: %pV\n",
33 c->vi.ubi_num, c->vi.vol_id, current->pid,
34 __builtin_return_address(0),
35 &vaf);
36
37 va_end(args);
 38}
39
40/* UBIFS warning messages */
41void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...)
42{
43 struct va_format vaf;
44 va_list args;
45
46 va_start(args, fmt);
47
48 vaf.fmt = fmt;
49 vaf.va = &args;
50
51 pr_warn("UBIFS warning (ubi%d:%d pid %d): %ps: %pV\n",
52 c->vi.ubi_num, c->vi.vol_id, current->pid,
53 __builtin_return_address(0),
54 &vaf);
55
56 va_end(args);
57}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a5697de763f5..c2a57e193a81 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -42,30 +42,6 @@
42/* Version of this UBIFS implementation */ 42/* Version of this UBIFS implementation */
43#define UBIFS_VERSION 1 43#define UBIFS_VERSION 1
44 44
45/* Normal UBIFS messages */
46#define ubifs_msg(c, fmt, ...) \
47 pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \
48 (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
49/* UBIFS error messages */
50#define ubifs_err(c, fmt, ...) \
51 pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \
52 (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
53 __func__, ##__VA_ARGS__)
54/* UBIFS warning messages */
55#define ubifs_warn(c, fmt, ...) \
56 pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \
57 (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
58 __func__, ##__VA_ARGS__)
59/*
60 * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
61 * object as an argument.
62 */
63#define ubifs_errc(c, fmt, ...) \
64 do { \
65 if (!(c)->probing) \
66 ubifs_err(c, fmt, ##__VA_ARGS__); \
67 } while (0)
68
69/* UBIFS file system VFS magic number */ 45/* UBIFS file system VFS magic number */
70#define UBIFS_SUPER_MAGIC 0x24051905 46#define UBIFS_SUPER_MAGIC 0x24051905
71 47
@@ -1802,4 +1778,21 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
1802#include "misc.h" 1778#include "misc.h"
1803#include "key.h" 1779#include "key.h"
1804 1780
1781/* Normal UBIFS messages */
1782__printf(2, 3)
1783void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
1784__printf(2, 3)
1785void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
1786__printf(2, 3)
1787void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
1788/*
1789 * A variant of 'ubifs_err()' which takes the UBIFS file-system description
1790 * object as an argument.
1791 */
1792#define ubifs_errc(c, fmt, ...) \
1793do { \
1794 if (!(c)->probing) \
1795 ubifs_err(c, fmt, ##__VA_ARGS__); \
1796} while (0)
1797
1805#endif /* !__UBIFS_H__ */ 1798#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c7f4d434d098..b043e044121d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -59,7 +59,6 @@
59#include <linux/fs.h> 59#include <linux/fs.h>
60#include <linux/slab.h> 60#include <linux/slab.h>
61#include <linux/xattr.h> 61#include <linux/xattr.h>
62#include <linux/posix_acl_xattr.h>
63 62
64/* 63/*
65 * Limit the number of extended attributes per inode so that the total size 64 * Limit the number of extended attributes per inode so that the total size
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 541d9c65014d..b51b371b874a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos; 46 loff_t nf_pos;
47 int flen; 47 int flen;
48 unsigned char *fname = NULL; 48 unsigned char *fname = NULL, *copy_name = NULL;
49 unsigned char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
@@ -143,7 +143,15 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
143 if (poffset >= lfi) { 143 if (poffset >= lfi) {
144 nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); 144 nameptr = (char *)(fibh.ebh->b_data + poffset - lfi);
145 } else { 145 } else {
146 nameptr = fname; 146 if (!copy_name) {
147 copy_name = kmalloc(UDF_NAME_LEN,
148 GFP_NOFS);
149 if (!copy_name) {
150 ret = -ENOMEM;
151 goto out;
152 }
153 }
154 nameptr = copy_name;
147 memcpy(nameptr, fi->fileIdent + liu, 155 memcpy(nameptr, fi->fileIdent + liu,
148 lfi - poffset); 156 lfi - poffset);
149 memcpy(nameptr + lfi - poffset, 157 memcpy(nameptr + lfi - poffset,
@@ -185,6 +193,7 @@ out:
185 brelse(fibh.sbh); 193 brelse(fibh.sbh);
186 brelse(epos.bh); 194 brelse(epos.bh);
187 kfree(fname); 195 kfree(fname);
196 kfree(copy_name);
188 197
189 return ret; 198 return ret;
190} 199}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 42eafb91f7ff..a2ba11eca995 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -165,7 +165,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
165 struct fileIdentDesc *fi = NULL; 165 struct fileIdentDesc *fi = NULL;
166 loff_t f_pos; 166 loff_t f_pos;
167 int block, flen; 167 int block, flen;
168 unsigned char *fname = NULL; 168 unsigned char *fname = NULL, *copy_name = NULL;
169 unsigned char *nameptr; 169 unsigned char *nameptr;
170 uint8_t lfi; 170 uint8_t lfi;
171 uint16_t liu; 171 uint16_t liu;
@@ -236,7 +236,15 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
236 nameptr = (uint8_t *)(fibh->ebh->b_data + 236 nameptr = (uint8_t *)(fibh->ebh->b_data +
237 poffset - lfi); 237 poffset - lfi);
238 else { 238 else {
239 nameptr = fname; 239 if (!copy_name) {
240 copy_name = kmalloc(UDF_NAME_LEN,
241 GFP_NOFS);
242 if (!copy_name) {
243 fi = ERR_PTR(-ENOMEM);
244 goto out_err;
245 }
246 }
247 nameptr = copy_name;
240 memcpy(nameptr, fi->fileIdent + liu, 248 memcpy(nameptr, fi->fileIdent + liu,
241 lfi - poffset); 249 lfi - poffset);
242 memcpy(nameptr + lfi - poffset, 250 memcpy(nameptr + lfi - poffset,
@@ -279,6 +287,7 @@ out_err:
279out_ok: 287out_ok:
280 brelse(epos.bh); 288 brelse(epos.bh);
281 kfree(fname); 289 kfree(fname);
290 kfree(copy_name);
282 291
283 return fi; 292 return fi;
284} 293}
@@ -291,7 +300,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
291 struct udf_fileident_bh fibh; 300 struct udf_fileident_bh fibh;
292 struct fileIdentDesc *fi; 301 struct fileIdentDesc *fi;
293 302
294 if (dentry->d_name.len > UDF_NAME_LEN - 2) 303 if (dentry->d_name.len > UDF_NAME_LEN)
295 return ERR_PTR(-ENAMETOOLONG); 304 return ERR_PTR(-ENAMETOOLONG);
296 305
297#ifdef UDF_RECOVERY 306#ifdef UDF_RECOVERY
@@ -351,7 +360,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
351 struct udf_inode_info *dinfo; 360 struct udf_inode_info *dinfo;
352 361
353 fibh->sbh = fibh->ebh = NULL; 362 fibh->sbh = fibh->ebh = NULL;
354 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 363 name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
355 if (!name) { 364 if (!name) {
356 *err = -ENOMEM; 365 *err = -ENOMEM;
357 goto out_err; 366 goto out_err;
@@ -362,8 +371,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
362 *err = -EINVAL; 371 *err = -EINVAL;
363 goto out_err; 372 goto out_err;
364 } 373 }
365 namelen = udf_put_filename(sb, dentry->d_name.name, name, 374 namelen = udf_put_filename(sb, dentry->d_name.name,
366 dentry->d_name.len); 375 dentry->d_name.len,
376 name, UDF_NAME_LEN_CS0);
367 if (!namelen) { 377 if (!namelen) {
368 *err = -ENAMETOOLONG; 378 *err = -ENAMETOOLONG;
369 goto out_err; 379 goto out_err;
@@ -914,7 +924,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
914 924
915 iinfo = UDF_I(inode); 925 iinfo = UDF_I(inode);
916 down_write(&iinfo->i_data_sem); 926 down_write(&iinfo->i_data_sem);
917 name = kmalloc(UDF_NAME_LEN, GFP_NOFS); 927 name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
918 if (!name) { 928 if (!name) {
919 err = -ENOMEM; 929 err = -ENOMEM;
920 goto out_no_entry; 930 goto out_no_entry;
@@ -997,8 +1007,9 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
997 } 1007 }
998 1008
999 if (pc->componentType == 5) { 1009 if (pc->componentType == 5) {
1000 namelen = udf_put_filename(sb, compstart, name, 1010 namelen = udf_put_filename(sb, compstart,
1001 symname - compstart); 1011 symname - compstart,
1012 name, UDF_NAME_LEN_CS0);
1002 if (!namelen) 1013 if (!namelen)
1003 goto out_no_entry; 1014 goto out_no_entry;
1004 1015
diff --git a/fs/udf/super.c b/fs/udf/super.c
index a522c15a0bfd..fa92fe839fda 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -887,18 +887,14 @@ static int udf_find_fileset(struct super_block *sb,
887static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 887static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
888{ 888{
889 struct primaryVolDesc *pvoldesc; 889 struct primaryVolDesc *pvoldesc;
890 struct ustr *instr, *outstr; 890 uint8_t *outstr;
891 struct buffer_head *bh; 891 struct buffer_head *bh;
892 uint16_t ident; 892 uint16_t ident;
893 int ret = -ENOMEM; 893 int ret = -ENOMEM;
894 894
895 instr = kmalloc(sizeof(struct ustr), GFP_NOFS); 895 outstr = kmalloc(128, GFP_NOFS);
896 if (!instr)
897 return -ENOMEM;
898
899 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
900 if (!outstr) 896 if (!outstr)
901 goto out1; 897 return -ENOMEM;
902 898
903 bh = udf_read_tagged(sb, block, block, &ident); 899 bh = udf_read_tagged(sb, block, block, &ident);
904 if (!bh) { 900 if (!bh) {
@@ -923,31 +919,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
923#endif 919#endif
924 } 920 }
925 921
926 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) { 922 ret = udf_CS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
927 ret = udf_CS0toUTF8(outstr, instr); 923 if (ret < 0)
928 if (ret < 0) 924 goto out_bh;
929 goto out_bh;
930 925
931 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, 926 strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
932 outstr->u_len > 31 ? 31 : outstr->u_len); 927 udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
933 udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
934 }
935 928
936 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) { 929 ret = udf_CS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
937 ret = udf_CS0toUTF8(outstr, instr); 930 if (ret < 0)
938 if (ret < 0) 931 goto out_bh;
939 goto out_bh;
940 932
941 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); 933 outstr[ret] = 0;
942 } 934 udf_debug("volSetIdent[] = '%s'\n", outstr);
943 935
944 ret = 0; 936 ret = 0;
945out_bh: 937out_bh:
946 brelse(bh); 938 brelse(bh);
947out2: 939out2:
948 kfree(outstr); 940 kfree(outstr);
949out1:
950 kfree(instr);
951 return ret; 941 return ret;
952} 942}
953 943
@@ -2358,7 +2348,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2358 le32_to_cpu(lvidiu->numDirs)) : 0) 2348 le32_to_cpu(lvidiu->numDirs)) : 0)
2359 + buf->f_bfree; 2349 + buf->f_bfree;
2360 buf->f_ffree = buf->f_bfree; 2350 buf->f_ffree = buf->f_bfree;
2361 buf->f_namelen = UDF_NAME_LEN - 2; 2351 buf->f_namelen = UDF_NAME_LEN;
2362 buf->f_fsid.val[0] = (u32)id; 2352 buf->f_fsid.val[0] = (u32)id;
2363 buf->f_fsid.val[1] = (u32)(id >> 32); 2353 buf->f_fsid.val[1] = (u32)(id >> 32);
2364 2354
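The udf_load_pvoldesc() rework above also shows udf_CS0toUTF8()'s new calling convention: a raw output buffer plus its capacity, then a raw CS0 byte string plus its length; it returns the number of UTF-8 bytes written or a negative errno, and the caller NUL-terminates. A sketch mirroring the volSetIdent path:

    uint8_t out[128];
    int len;

    len = udf_CS0toUTF8(out, 127, pvoldesc->volSetIdent, 128);
    if (len < 0)
            goto out_bh;            /* -EINVAL on a bad compression ID */
    out[len] = 0;                   /* caller terminates the string */

The udf_statfs() hunk is consistent with this: UDF_NAME_LEN itself now counts only name bytes (254), so the old "- 2" correction is dropped while the reported f_namelen stays 254.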
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index fa0044b6b81d..972b70625614 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -49,8 +49,8 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb,
49#define UDF_EXTENT_FLAG_MASK 0xC0000000 49#define UDF_EXTENT_FLAG_MASK 0xC0000000
50 50
51#define UDF_NAME_PAD 4 51#define UDF_NAME_PAD 4
52#define UDF_NAME_LEN 256 52#define UDF_NAME_LEN 254
53#define UDF_PATH_LEN 1023 53#define UDF_NAME_LEN_CS0 255
54 54
55static inline size_t udf_file_entry_alloc_offset(struct inode *inode) 55static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
56{ 56{
@@ -106,12 +106,6 @@ struct generic_desc {
106 __le32 volDescSeqNum; 106 __le32 volDescSeqNum;
107}; 107};
108 108
109struct ustr {
110 uint8_t u_cmpID;
111 uint8_t u_name[UDF_NAME_LEN - 2];
112 uint8_t u_len;
113};
114
115 109
116/* super.c */ 110/* super.c */
117 111
@@ -214,12 +208,11 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
214} 208}
215 209
216/* unicode.c */ 210/* unicode.c */
217extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *, 211extern int udf_get_filename(struct super_block *, const uint8_t *, int,
218 int); 212 uint8_t *, int);
219extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 213extern int udf_put_filename(struct super_block *, const uint8_t *, int,
220 int); 214 uint8_t *, int);
221extern int udf_build_ustr(struct ustr *, dstring *, int); 215extern int udf_CS0toUTF8(uint8_t *, int, const uint8_t *, int);
222extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
223 216
224/* ialloc.c */ 217/* ialloc.c */
225extern void udf_free_inode(struct inode *); 218extern void udf_free_inode(struct inode *);
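The split of UDF_NAME_LEN into two constants reflects the on-disk layout: a CS0 name buffer carries a one-byte compression ID ahead of up to 254 bytes of name data (255 bytes total), while 254 is the cap on the converted name the VFS sees. The compression ID also encodes the character width, as the new unicode.c code below exploits; an illustrative decoder:

    /* Illustrative only: the first CS0 byte selects the code-point width. */
    static int cs0_char_width(const uint8_t *cs0)
    {
            uint8_t cmp_id = cs0[0];        /* 8 => 1-byte chars, 16 => 2-byte */

            if (cmp_id != 8 && cmp_id != 16)
                    return -EINVAL;
            return cmp_id >> 3;             /* matches u_ch in udf_name_from_CS0() */
    }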
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index e788a05aab83..3ff42f4437f3 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,199 +28,72 @@
28 28
29#include "udf_sb.h" 29#include "udf_sb.h"
30 30
31static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, 31static int udf_uni2char_utf8(wchar_t uni,
32 int); 32 unsigned char *out,
33 33 int boundlen)
34static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
35{
36 if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
37 return 0;
38
39 memset(dest, 0, sizeof(struct ustr));
40 memcpy(dest->u_name, src, strlen);
41 dest->u_cmpID = 0x08;
42 dest->u_len = strlen;
43
44 return strlen;
45}
46
47/*
48 * udf_build_ustr
49 */
50int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
51{
52 int usesize;
53
54 if (!dest || !ptr || !size)
55 return -1;
56 BUG_ON(size < 2);
57
58 usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
59 usesize = min(usesize, size - 2);
60 dest->u_cmpID = ptr[0];
61 dest->u_len = usesize;
62 memcpy(dest->u_name, ptr + 1, usesize);
63 memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
64
65 return 0;
66}
67
68/*
69 * udf_build_ustr_exact
70 */
71static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
72{
73 memset(dest, 0, sizeof(struct ustr));
74 dest->u_cmpID = ptr[0];
75 dest->u_len = exactsize - 1;
76 memcpy(dest->u_name, ptr + 1, exactsize - 1);
77}
78
79/*
80 * udf_CS0toUTF8
81 *
82 * PURPOSE
83 * Convert OSTA Compressed Unicode to the UTF-8 equivalent.
84 *
85 * PRE-CONDITIONS
86 * utf Pointer to UTF-8 output buffer.
87 * ocu Pointer to OSTA Compressed Unicode input buffer
88 * of size UDF_NAME_LEN bytes.
89 * both of type "struct ustr *"
90 *
91 * POST-CONDITIONS
92 * <return> >= 0 on success.
93 *
94 * HISTORY
95 * November 12, 1997 - Andrew E. Mileski
96 * Written, tested, and released.
97 */
98int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
99{ 34{
100 const uint8_t *ocu; 35 int u_len = 0;
101 uint8_t cmp_id, ocu_len; 36
102 int i; 37 if (boundlen <= 0)
103 38 return -ENAMETOOLONG;
104 ocu_len = ocu_i->u_len; 39
105 if (ocu_len == 0) { 40 if (uni < 0x80) {
106 memset(utf_o, 0, sizeof(struct ustr)); 41 out[u_len++] = (unsigned char)uni;
107 return 0; 42 } else if (uni < 0x800) {
108 } 43 if (boundlen < 2)
109 44 return -ENAMETOOLONG;
110 cmp_id = ocu_i->u_cmpID; 45 out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
111 if (cmp_id != 8 && cmp_id != 16) { 46 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
112 memset(utf_o, 0, sizeof(struct ustr)); 47 } else {
113 pr_err("unknown compression code (%d) stri=%s\n", 48 if (boundlen < 3)
114 cmp_id, ocu_i->u_name); 49 return -ENAMETOOLONG;
115 return -EINVAL; 50 out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
116 } 51 out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
117 52 out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
118 ocu = ocu_i->u_name;
119 utf_o->u_len = 0;
120 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
121
122 /* Expand OSTA compressed Unicode to Unicode */
123 uint32_t c = ocu[i++];
124 if (cmp_id == 16)
125 c = (c << 8) | ocu[i++];
126
127 /* Compress Unicode to UTF-8 */
128 if (c < 0x80U)
129 utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
130 else if (c < 0x800U) {
131 if (utf_o->u_len > (UDF_NAME_LEN - 4))
132 break;
133 utf_o->u_name[utf_o->u_len++] =
134 (uint8_t)(0xc0 | (c >> 6));
135 utf_o->u_name[utf_o->u_len++] =
136 (uint8_t)(0x80 | (c & 0x3f));
137 } else {
138 if (utf_o->u_len > (UDF_NAME_LEN - 5))
139 break;
140 utf_o->u_name[utf_o->u_len++] =
141 (uint8_t)(0xe0 | (c >> 12));
142 utf_o->u_name[utf_o->u_len++] =
143 (uint8_t)(0x80 |
144 ((c >> 6) & 0x3f));
145 utf_o->u_name[utf_o->u_len++] =
146 (uint8_t)(0x80 | (c & 0x3f));
147 }
148 } 53 }
149 utf_o->u_cmpID = 8; 54 return u_len;
150
151 return utf_o->u_len;
152} 55}
153 56
154/* 57static int udf_char2uni_utf8(const unsigned char *in,
155 * 58 int boundlen,
156 * udf_UTF8toCS0 59 wchar_t *uni)
157 *
158 * PURPOSE
159 * Convert UTF-8 to the OSTA Compressed Unicode equivalent.
160 *
161 * DESCRIPTION
162 * This routine is only called by udf_lookup().
163 *
164 * PRE-CONDITIONS
165 * ocu Pointer to OSTA Compressed Unicode output
166 * buffer of size UDF_NAME_LEN bytes.
167 * utf Pointer to UTF-8 input buffer.
168 * utf_len Length of UTF-8 input buffer in bytes.
169 *
170 * POST-CONDITIONS
171 * <return> Zero on success.
172 *
173 * HISTORY
174 * November 12, 1997 - Andrew E. Mileski
175 * Written, tested, and released.
176 */
177static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
178{ 60{
179 unsigned c, i, max_val, utf_char; 61 unsigned int utf_char;
180 int utf_cnt, u_len, u_ch; 62 unsigned char c;
181 63 int utf_cnt, u_len;
182 memset(ocu, 0, sizeof(dstring) * length);
183 ocu[0] = 8;
184 max_val = 0xffU;
185 u_ch = 1;
186 64
187try_again: 65 utf_char = 0;
188 u_len = 0U; 66 utf_cnt = 0;
189 utf_char = 0U; 67 for (u_len = 0; u_len < boundlen;) {
190 utf_cnt = 0U; 68 c = in[u_len++];
191 for (i = 0U; i < utf->u_len; i++) {
192 /* Name didn't fit? */
193 if (u_len + 1 + u_ch >= length)
194 return 0;
195
196 c = (uint8_t)utf->u_name[i];
197 69
198 /* Complete a multi-byte UTF-8 character */ 70 /* Complete a multi-byte UTF-8 character */
199 if (utf_cnt) { 71 if (utf_cnt) {
200 utf_char = (utf_char << 6) | (c & 0x3fU); 72 utf_char = (utf_char << 6) | (c & 0x3f);
201 if (--utf_cnt) 73 if (--utf_cnt)
202 continue; 74 continue;
203 } else { 75 } else {
204 /* Check for a multi-byte UTF-8 character */ 76 /* Check for a multi-byte UTF-8 character */
205 if (c & 0x80U) { 77 if (c & 0x80) {
206 /* Start a multi-byte UTF-8 character */ 78 /* Start a multi-byte UTF-8 character */
207 if ((c & 0xe0U) == 0xc0U) { 79 if ((c & 0xe0) == 0xc0) {
208 utf_char = c & 0x1fU; 80 utf_char = c & 0x1f;
209 utf_cnt = 1; 81 utf_cnt = 1;
210 } else if ((c & 0xf0U) == 0xe0U) { 82 } else if ((c & 0xf0) == 0xe0) {
211 utf_char = c & 0x0fU; 83 utf_char = c & 0x0f;
212 utf_cnt = 2; 84 utf_cnt = 2;
213 } else if ((c & 0xf8U) == 0xf0U) { 85 } else if ((c & 0xf8) == 0xf0) {
214 utf_char = c & 0x07U; 86 utf_char = c & 0x07;
215 utf_cnt = 3; 87 utf_cnt = 3;
216 } else if ((c & 0xfcU) == 0xf8U) { 88 } else if ((c & 0xfc) == 0xf8) {
217 utf_char = c & 0x03U; 89 utf_char = c & 0x03;
218 utf_cnt = 4; 90 utf_cnt = 4;
219 } else if ((c & 0xfeU) == 0xfcU) { 91 } else if ((c & 0xfe) == 0xfc) {
220 utf_char = c & 0x01U; 92 utf_char = c & 0x01;
221 utf_cnt = 5; 93 utf_cnt = 5;
222 } else { 94 } else {
223 goto error_out; 95 utf_cnt = -1;
96 break;
224 } 97 }
225 continue; 98 continue;
226 } else { 99 } else {
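udf_uni2char_utf8() and udf_char2uni_utf8() above are open-coded converters with the same shape as the NLS table hooks (uni2char/char2uni), which lets UTF-8 mounts share one conversion path with NLS mounts further down in this file. The encoder covers code points needing up to three UTF-8 bytes. A hypothetical standalone check of the two-byte case, mirroring the logic above:

    #include <assert.h>

    static int uni2utf8(unsigned int uni, unsigned char *out)
    {
            if (uni < 0x80) {
                    out[0] = (unsigned char)uni;
                    return 1;
            }
            if (uni < 0x800) {
                    out[0] = 0xc0 | (uni >> 6);
                    out[1] = 0x80 | (uni & 0x3f);
                    return 2;
            }
            out[0] = 0xe0 | (uni >> 12);
            out[1] = 0x80 | ((uni >> 6) & 0x3f);
            out[2] = 0x80 | (uni & 0x3f);
            return 3;
    }

    int main(void)
    {
            unsigned char b[3];

            /* U+00E9 (e acute) encodes as 0xC3 0xA9 */
            assert(uni2utf8(0x00e9, b) == 2 && b[0] == 0xc3 && b[1] == 0xa9);
            return 0;
    }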
@@ -228,97 +101,216 @@ try_again:
228 utf_char = c; 101 utf_char = c;
229 } 102 }
230 } 103 }
231 104 *uni = utf_char;
232 /* Choose no compression if necessary */ 105 break;
233 if (utf_char > max_val) {
234 if (max_val == 0xffU) {
235 max_val = 0xffffU;
236 ocu[0] = (uint8_t)0x10U;
237 u_ch = 2;
238 goto try_again;
239 }
240 goto error_out;
241 }
242
243 if (max_val == 0xffffU)
244 ocu[++u_len] = (uint8_t)(utf_char >> 8);
245 ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
246 } 106 }
247
248 if (utf_cnt) { 107 if (utf_cnt) {
249error_out: 108 *uni = '?';
250 ocu[++u_len] = '?'; 109 return -EINVAL;
251 printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
252 } 110 }
111 return u_len;
112}
253 113
254 ocu[length - 1] = (uint8_t)u_len + 1; 114#define ILLEGAL_CHAR_MARK '_'
115#define EXT_MARK '.'
116#define CRC_MARK '#'
117#define EXT_SIZE 5
118/* Number of chars we need to store generated CRC to make filename unique */
119#define CRC_LEN 5
120
121static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
122 int *str_o_idx,
123 const uint8_t *str_i, int str_i_max_len,
124 int *str_i_idx,
125 int u_ch, int *needsCRC,
126 int (*conv_f)(wchar_t, unsigned char *, int),
127 int translate)
128{
129 uint32_t c;
130 int illChar = 0;
131 int len, gotch = 0;
132
133 for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
134 if (*str_o_idx >= str_o_max_len) {
135 *needsCRC = 1;
136 return gotch;
137 }
255 138
256 return u_len + 1; 139 /* Expand OSTA compressed Unicode to Unicode */
140 c = str_i[*str_i_idx];
141 if (u_ch > 1)
142 c = (c << 8) | str_i[*str_i_idx + 1];
143
144 if (translate && (c == '/' || c == 0))
145 illChar = 1;
146 else if (illChar)
147 break;
148 else
149 gotch = 1;
150 }
151 if (illChar) {
152 *needsCRC = 1;
153 c = ILLEGAL_CHAR_MARK;
154 gotch = 1;
155 }
156 if (gotch) {
157 len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
158 /* Valid character? */
159 if (len >= 0)
160 *str_o_idx += len;
161 else if (len == -ENAMETOOLONG) {
162 *needsCRC = 1;
163 gotch = 0;
164 } else {
165 str_o[(*str_o_idx)++] = '?';
166 *needsCRC = 1;
167 }
168 }
169 return gotch;
257} 170}
258 171
259static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, 172static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
260 const struct ustr *ocu_i) 173 const uint8_t *ocu, int ocu_len,
174 int (*conv_f)(wchar_t, unsigned char *, int),
175 int translate)
261{ 176{
262 const uint8_t *ocu; 177 uint32_t c;
263 uint8_t cmp_id, ocu_len; 178 uint8_t cmp_id;
264 int i, len; 179 int idx, len;
180 int u_ch;
181 int needsCRC = 0;
182 int ext_i_len, ext_max_len;
183 int str_o_len = 0; /* Length of resulting output */
184 int ext_o_len = 0; /* Extension output length */
185 int ext_crc_len = 0; /* Extension output length if used with CRC */
186 int i_ext = -1; /* Extension position in input buffer */
187 int o_crc = 0; /* Rightmost possible output pos for CRC+ext */
188 unsigned short valueCRC;
189 uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
190 uint8_t crc[CRC_LEN];
265 191
192 if (str_max_len <= 0)
193 return 0;
266 194
267 ocu_len = ocu_i->u_len;
268 if (ocu_len == 0) { 195 if (ocu_len == 0) {
269 memset(utf_o, 0, sizeof(struct ustr)); 196 memset(str_o, 0, str_max_len);
270 return 0; 197 return 0;
271 } 198 }
272 199
273 cmp_id = ocu_i->u_cmpID; 200 cmp_id = ocu[0];
274 if (cmp_id != 8 && cmp_id != 16) { 201 if (cmp_id != 8 && cmp_id != 16) {
275 memset(utf_o, 0, sizeof(struct ustr)); 202 memset(str_o, 0, str_max_len);
276 pr_err("unknown compression code (%d) stri=%s\n", 203 pr_err("unknown compression code (%d)\n", cmp_id);
277 cmp_id, ocu_i->u_name);
278 return -EINVAL; 204 return -EINVAL;
279 } 205 }
206 u_ch = cmp_id >> 3;
280 207
281 ocu = ocu_i->u_name; 208 ocu++;
282 utf_o->u_len = 0; 209 ocu_len--;
283 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
284 /* Expand OSTA compressed Unicode to Unicode */
285 uint32_t c = ocu[i++];
286 if (cmp_id == 16)
287 c = (c << 8) | ocu[i++];
288 210
289 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 211 if (ocu_len % u_ch) {
290 UDF_NAME_LEN - 2 - utf_o->u_len); 212 pr_err("incorrect filename length (%d)\n", ocu_len + 1);
291 /* Valid character? */ 213 return -EINVAL;
292 if (len >= 0) 214 }
293 utf_o->u_len += len; 215
294 else 216 if (translate) {
295 utf_o->u_name[utf_o->u_len++] = '?'; 217 /* Look for extension */
218 for (idx = ocu_len - u_ch, ext_i_len = 0;
219 (idx >= 0) && (ext_i_len < EXT_SIZE);
220 idx -= u_ch, ext_i_len++) {
221 c = ocu[idx];
222 if (u_ch > 1)
223 c = (c << 8) | ocu[idx + 1];
224
225 if (c == EXT_MARK) {
226 if (ext_i_len)
227 i_ext = idx;
228 break;
229 }
230 }
231 if (i_ext >= 0) {
232 /* Convert extension */
233 ext_max_len = min_t(int, sizeof(ext), str_max_len);
234 ext[ext_o_len++] = EXT_MARK;
235 idx = i_ext + u_ch;
236 while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
237 ocu, ocu_len, &idx,
238 u_ch, &needsCRC,
239 conv_f, translate)) {
240 if ((ext_o_len + CRC_LEN) < str_max_len)
241 ext_crc_len = ext_o_len;
242 }
243 }
296 } 244 }
297 utf_o->u_cmpID = 8;
298 245
299 return utf_o->u_len; 246 idx = 0;
247 while (1) {
248 if (translate && (idx == i_ext)) {
249 if (str_o_len > (str_max_len - ext_o_len))
250 needsCRC = 1;
251 break;
252 }
253
254 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
255 ocu, ocu_len, &idx,
256 u_ch, &needsCRC, conv_f, translate))
257 break;
258
259 if (translate &&
260 (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
261 o_crc = str_o_len;
262 }
263
264 if (translate) {
265 if (str_o_len <= 2 && str_o[0] == '.' &&
266 (str_o_len == 1 || str_o[1] == '.'))
267 needsCRC = 1;
268 if (needsCRC) {
269 str_o_len = o_crc;
270 valueCRC = crc_itu_t(0, ocu, ocu_len);
271 crc[0] = CRC_MARK;
272 crc[1] = hex_asc_upper_hi(valueCRC >> 8);
273 crc[2] = hex_asc_upper_lo(valueCRC >> 8);
274 crc[3] = hex_asc_upper_hi(valueCRC);
275 crc[4] = hex_asc_upper_lo(valueCRC);
276 len = min_t(int, CRC_LEN, str_max_len - str_o_len);
277 memcpy(&str_o[str_o_len], crc, len);
278 str_o_len += len;
279 ext_o_len = ext_crc_len;
280 }
281 if (ext_o_len > 0) {
282 memcpy(&str_o[str_o_len], ext, ext_o_len);
283 str_o_len += ext_o_len;
284 }
285 }
286
287 return str_o_len;
300} 288}
301 289
302static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 290static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
303 int length) 291 const uint8_t *str_i, int str_len,
292 int (*conv_f)(const unsigned char *, int, wchar_t *))
304{ 293{
305 int len; 294 int i, len;
306 unsigned i, max_val; 295 unsigned int max_val;
307 uint16_t uni_char; 296 wchar_t uni_char;
308 int u_len, u_ch; 297 int u_len, u_ch;
309 298
310 memset(ocu, 0, sizeof(dstring) * length); 299 if (ocu_max_len <= 0)
300 return 0;
301
302 memset(ocu, 0, ocu_max_len);
311 ocu[0] = 8; 303 ocu[0] = 8;
312 max_val = 0xffU; 304 max_val = 0xff;
313 u_ch = 1; 305 u_ch = 1;
314 306
315try_again: 307try_again:
316 u_len = 0U; 308 u_len = 1;
317 for (i = 0U; i < uni->u_len; i++) { 309 for (i = 0; i < str_len; i++) {
318 /* Name didn't fit? */ 310 /* Name didn't fit? */
319 if (u_len + 1 + u_ch >= length) 311 if (u_len + u_ch > ocu_max_len)
320 return 0; 312 return 0;
321 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 313 len = conv_f(&str_i[i], str_len - i, &uni_char);
322 if (!len) 314 if (!len)
323 continue; 315 continue;
324 /* Invalid character, deal with it */ 316 /* Invalid character, deal with it */
@@ -328,187 +320,65 @@ try_again:
328 } 320 }
329 321
330 if (uni_char > max_val) { 322 if (uni_char > max_val) {
331 max_val = 0xffffU; 323 max_val = 0xffff;
332 ocu[0] = (uint8_t)0x10U; 324 ocu[0] = 0x10;
333 u_ch = 2; 325 u_ch = 2;
334 goto try_again; 326 goto try_again;
335 } 327 }
336 328
337 if (max_val == 0xffffU) 329 if (max_val == 0xffff)
338 ocu[++u_len] = (uint8_t)(uni_char >> 8); 330 ocu[u_len++] = (uint8_t)(uni_char >> 8);
339 ocu[++u_len] = (uint8_t)(uni_char & 0xffU); 331 ocu[u_len++] = (uint8_t)(uni_char & 0xff);
340 i += len - 1; 332 i += len - 1;
341 } 333 }
342 334
343 ocu[length - 1] = (uint8_t)u_len + 1; 335 return u_len;
344 return u_len + 1;
345} 336}
346 337
347int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, 338int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
339{
340 return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
341 udf_uni2char_utf8, 0);
342}
343
344int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
348 uint8_t *dname, int dlen) 345 uint8_t *dname, int dlen)
349{ 346{
350 struct ustr *filename, *unifilename; 347 int (*conv_f)(wchar_t, unsigned char *, int);
351 int ret; 348 int ret;
352 349
353 if (!slen) 350 if (!slen)
354 return -EIO; 351 return -EIO;
355 352
356 filename = kmalloc(sizeof(struct ustr), GFP_NOFS); 353 if (dlen <= 0)
357 if (!filename) 354 return 0;
358 return -ENOMEM;
359
360 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
361 if (!unifilename) {
362 ret = -ENOMEM;
363 goto out1;
364 }
365 355
366 udf_build_ustr_exact(unifilename, sname, slen);
367 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 356 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
368 ret = udf_CS0toUTF8(filename, unifilename); 357 conv_f = udf_uni2char_utf8;
369 if (ret < 0) {
370 udf_debug("Failed in udf_get_filename: sname = %s\n",
371 sname);
372 goto out2;
373 }
374 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 358 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
375 ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, 359 conv_f = UDF_SB(sb)->s_nls_map->uni2char;
376 unifilename);
377 if (ret < 0) {
378 udf_debug("Failed in udf_get_filename: sname = %s\n",
379 sname);
380 goto out2;
381 }
382 } else 360 } else
383 BUG(); 361 BUG();
384 362
385 ret = udf_translate_to_linux(dname, dlen, 363 ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
386 filename->u_name, filename->u_len,
387 unifilename->u_name, unifilename->u_len);
388 /* Zero length filename isn't valid... */ 364 /* Zero length filename isn't valid... */
389 if (ret == 0) 365 if (ret == 0)
390 ret = -EINVAL; 366 ret = -EINVAL;
391out2:
392 kfree(unifilename);
393out1:
394 kfree(filename);
395 return ret; 367 return ret;
396} 368}
397 369
398int udf_put_filename(struct super_block *sb, const uint8_t *sname, 370int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
399 uint8_t *dname, int flen) 371 uint8_t *dname, int dlen)
400{ 372{
401 struct ustr unifilename; 373 int (*conv_f)(const unsigned char *, int, wchar_t *);
402 int namelen;
403
404 if (!udf_char_to_ustr(&unifilename, sname, flen))
405 return 0;
406 374
407 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 375 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
408 namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN); 376 conv_f = udf_char2uni_utf8;
409 if (!namelen)
410 return 0;
411 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 377 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
412 namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, 378 conv_f = UDF_SB(sb)->s_nls_map->char2uni;
413 &unifilename, UDF_NAME_LEN);
414 if (!namelen)
415 return 0;
416 } else 379 } else
417 return 0; 380 BUG();
418 381
419 return namelen; 382 return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
420} 383}
421 384
422#define ILLEGAL_CHAR_MARK '_'
423#define EXT_MARK '.'
424#define CRC_MARK '#'
425#define EXT_SIZE 5
426/* Number of chars we need to store generated CRC to make filename unique */
427#define CRC_LEN 5
428
429static int udf_translate_to_linux(uint8_t *newName, int newLen,
430 uint8_t *udfName, int udfLen,
431 uint8_t *fidName, int fidNameLen)
432{
433 int index, newIndex = 0, needsCRC = 0;
434 int extIndex = 0, newExtIndex = 0, hasExt = 0;
435 unsigned short valueCRC;
436 uint8_t curr;
437
438 if (udfName[0] == '.' &&
439 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
440 needsCRC = 1;
441 newIndex = udfLen;
442 memcpy(newName, udfName, udfLen);
443 } else {
444 for (index = 0; index < udfLen; index++) {
445 curr = udfName[index];
446 if (curr == '/' || curr == 0) {
447 needsCRC = 1;
448 curr = ILLEGAL_CHAR_MARK;
449 while (index + 1 < udfLen &&
450 (udfName[index + 1] == '/' ||
451 udfName[index + 1] == 0))
452 index++;
453 }
454 if (curr == EXT_MARK &&
455 (udfLen - index - 1) <= EXT_SIZE) {
456 if (udfLen == index + 1)
457 hasExt = 0;
458 else {
459 hasExt = 1;
460 extIndex = index;
461 newExtIndex = newIndex;
462 }
463 }
464 if (newIndex < newLen)
465 newName[newIndex++] = curr;
466 else
467 needsCRC = 1;
468 }
469 }
470 if (needsCRC) {
471 uint8_t ext[EXT_SIZE];
472 int localExtIndex = 0;
473
474 if (hasExt) {
475 int maxFilenameLen;
476 for (index = 0;
477 index < EXT_SIZE && extIndex + index + 1 < udfLen;
478 index++) {
479 curr = udfName[extIndex + index + 1];
480
481 if (curr == '/' || curr == 0) {
482 needsCRC = 1;
483 curr = ILLEGAL_CHAR_MARK;
484 while (extIndex + index + 2 < udfLen &&
485 (index + 1 < EXT_SIZE &&
486 (udfName[extIndex + index + 2] == '/' ||
487 udfName[extIndex + index + 2] == 0)))
488 index++;
489 }
490 ext[localExtIndex++] = curr;
491 }
492 maxFilenameLen = newLen - CRC_LEN - localExtIndex;
493 if (newIndex > maxFilenameLen)
494 newIndex = maxFilenameLen;
495 else
496 newIndex = newExtIndex;
497 } else if (newIndex > newLen - CRC_LEN)
498 newIndex = newLen - CRC_LEN;
499 newName[newIndex++] = CRC_MARK;
500 valueCRC = crc_itu_t(0, fidName, fidNameLen);
501 newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
502 newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
503 newName[newIndex++] = hex_asc_upper_hi(valueCRC);
504 newName[newIndex++] = hex_asc_upper_lo(valueCRC);
505
506 if (hasExt) {
507 newName[newIndex++] = EXT_MARK;
508 for (index = 0; index < localExtIndex; index++)
509 newName[newIndex++] = ext[index];
510 }
511 }
512
513 return newIndex;
514}
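The translate path above replaces the old udf_translate_to_linux(): illegal characters ('/' or NUL) collapse to a single '_' and set needsCRC, and when mangling is needed the name is truncated at the last position (o_crc) that leaves room for '#' plus four uppercase hex digits of crc_itu_t() over the raw CS0 bytes, with as much of the extension as fits re-appended afterwards. So a name decoding to "bad/name.txt" first becomes "bad_name.txt" and then something like "bad_name#1A2B.txt" (CRC digits illustrative). A sketch of the suffix construction as done in udf_name_from_CS0():

    unsigned short v = crc_itu_t(0, ocu, ocu_len);  /* over the raw CS0 bytes */
    uint8_t crc[CRC_LEN];

    crc[0] = CRC_MARK;                      /* '#' */
    crc[1] = hex_asc_upper_hi(v >> 8);
    crc[2] = hex_asc_upper_lo(v >> 8);
    crc[3] = hex_asc_upper_hi(v);
    crc[4] = hex_asc_upper_lo(v);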
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f64639176670..3542d94fddce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,4 +121,5 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
121xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 121xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
122xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o 122xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
123xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o 123xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
124xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o 124xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
125xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 444626ddbd1b..d9b42425291e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -118,8 +118,6 @@ xfs_allocbt_free_block(
118 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 118 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
119 XFS_EXTENT_BUSY_SKIP_DISCARD); 119 XFS_EXTENT_BUSY_SKIP_DISCARD);
120 xfs_trans_agbtree_delta(cur->bc_tp, -1); 120 xfs_trans_agbtree_delta(cur->bc_tp, -1);
121
122 xfs_trans_binval(cur->bc_tp, bp);
123 return 0; 121 return 0;
124} 122}
125 123
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..90928bbe693c 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -24,22 +24,6 @@
24 * Small attribute lists are packed as tightly as possible so as 24 * Small attribute lists are packed as tightly as possible so as
25 * to fit into the literal area of the inode. 25 * to fit into the literal area of the inode.
26 */ 26 */
27
28/*
29 * Entries are packed toward the top as tight as possible.
30 */
31typedef struct xfs_attr_shortform {
32 struct xfs_attr_sf_hdr { /* constant-structure header block */
33 __be16 totsize; /* total bytes in shortform list */
34 __u8 count; /* count of active entries */
35 } hdr;
36 struct xfs_attr_sf_entry {
37 __uint8_t namelen; /* actual length of name (no NULL) */
38 __uint8_t valuelen; /* actual length of value (no NULL) */
39 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
40 __uint8_t nameval[1]; /* name & value bytes concatenated */
41 } list[1]; /* variable sized array */
42} xfs_attr_shortform_t;
43typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; 27typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
44typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; 28typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
45 29
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ef00156f4f96..041b6948aecc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -477,10 +477,7 @@ xfs_bmap_check_leaf_extents(
477 } 477 }
478 block = XFS_BUF_TO_BLOCK(bp); 478 block = XFS_BUF_TO_BLOCK(bp);
479 } 479 }
480 if (bp_release) { 480
481 bp_release = 0;
482 xfs_trans_brelse(NULL, bp);
483 }
484 return; 481 return;
485 482
486error0: 483error0:
@@ -912,7 +909,7 @@ xfs_bmap_local_to_extents(
912 * We don't want to deal with the case of keeping inode data inline yet. 909 * We don't want to deal with the case of keeping inode data inline yet.
913 * So sending the data fork of a regular inode is invalid. 910 * So sending the data fork of a regular inode is invalid.
914 */ 911 */
915 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); 912 ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
916 ifp = XFS_IFORK_PTR(ip, whichfork); 913 ifp = XFS_IFORK_PTR(ip, whichfork);
917 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 914 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
918 915
@@ -1079,7 +1076,7 @@ xfs_bmap_add_attrfork_local(
1079 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) 1076 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
1080 return 0; 1077 return 0;
1081 1078
1082 if (S_ISDIR(ip->i_d.di_mode)) { 1079 if (S_ISDIR(VFS_I(ip)->i_mode)) {
1083 memset(&dargs, 0, sizeof(dargs)); 1080 memset(&dargs, 0, sizeof(dargs));
1084 dargs.geo = ip->i_mount->m_dir_geo; 1081 dargs.geo = ip->i_mount->m_dir_geo;
1085 dargs.dp = ip; 1082 dargs.dp = ip;
@@ -1091,7 +1088,7 @@ xfs_bmap_add_attrfork_local(
1091 return xfs_dir2_sf_to_block(&dargs); 1088 return xfs_dir2_sf_to_block(&dargs);
1092 } 1089 }
1093 1090
1094 if (S_ISLNK(ip->i_d.di_mode)) 1091 if (S_ISLNK(VFS_I(ip)->i_mode))
1095 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, 1092 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
1096 flags, XFS_DATA_FORK, 1093 flags, XFS_DATA_FORK,
1097 xfs_symlink_local_to_remote); 1094 xfs_symlink_local_to_remote);
@@ -4721,6 +4718,66 @@ error0:
4721} 4718}
4722 4719
4723/* 4720/*
4721 * When a delalloc extent is split (e.g., due to a hole punch), the original
4722 * indlen reservation must be shared across the two new extents that are left
4723 * behind.
4724 *
4725 * Given the original reservation and the worst case indlen for the two new
4726 * extents (as calculated by xfs_bmap_worst_indlen()), split the original
4727 * reservation fairly across the two new extents. If necessary, steal available
4728 * blocks from a deleted extent to make up a reservation deficiency (e.g., if
4729 * ores == 1). The number of stolen blocks is returned. The availability and
4730 * subsequent accounting of stolen blocks is the responsibility of the caller.
4731 */
4732static xfs_filblks_t
4733xfs_bmap_split_indlen(
4734 xfs_filblks_t ores, /* original res. */
4735 xfs_filblks_t *indlen1, /* ext1 worst indlen */
4736 xfs_filblks_t *indlen2, /* ext2 worst indlen */
4737 xfs_filblks_t avail) /* stealable blocks */
4738{
4739 xfs_filblks_t len1 = *indlen1;
4740 xfs_filblks_t len2 = *indlen2;
4741 xfs_filblks_t nres = len1 + len2; /* new total res. */
4742 xfs_filblks_t stolen = 0;
4743
4744 /*
4745 * Steal as many blocks as we can to try and satisfy the worst case
4746 * indlen for both new extents.
4747 */
4748 while (nres > ores && avail) {
4749 nres--;
4750 avail--;
4751 stolen++;
4752 }
4753
4754 /*
4755 * The only blocks available are those reserved for the original
4756 * extent and what we can steal from the extent being removed.
4757 * If this still isn't enough to satisfy the combined
4758 * requirements for the two new extents, skim blocks off of each
4759 * of the new reservations until they match what is available.
4760 */
4761 while (nres > ores) {
4762 if (len1) {
4763 len1--;
4764 nres--;
4765 }
4766 if (nres == ores)
4767 break;
4768 if (len2) {
4769 len2--;
4770 nres--;
4771 }
4772 }
4773
4774 *indlen1 = len1;
4775 *indlen2 = len2;
4776
4777 return stolen;
4778}
4779
4780/*
4724 * Called by xfs_bmapi to update file extent records and the btree 4781 * Called by xfs_bmapi to update file extent records and the btree
4725 * after removing space (or undoing a delayed allocation). 4782 * after removing space (or undoing a delayed allocation).
4726 */ 4783 */
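xfs_bmap_split_indlen() is easiest to see with numbers. Suppose the original reservation was ores = 5 blocks, the two new extents each have a worst-case indlen of 4 (nres = 8), and avail = 2 blocks can be stolen from the deleted extent: stealing reduces the deficit to 6 vs 5, then one block is skimmed from the first extent, giving indlen1 = 3, indlen2 = 4, stolen = 2, and da_new = 3 + 4 - 2 = 5 = ores. A standalone replica of the logic with those numbers (not kernel code):

    #include <assert.h>

    typedef unsigned long long filblks_t;

    static filblks_t split_indlen(filblks_t ores, filblks_t *len1,
                                  filblks_t *len2, filblks_t avail)
    {
            filblks_t nres = *len1 + *len2;
            filblks_t stolen = 0;

            while (nres > ores && avail) {          /* steal first */
                    nres--; avail--; stolen++;
            }
            while (nres > ores) {                   /* then skim each extent */
                    if (*len1) { (*len1)--; nres--; }
                    if (nres == ores)
                            break;
                    if (*len2) { (*len2)--; nres--; }
            }
            return stolen;
    }

    int main(void)
    {
            filblks_t l1 = 4, l2 = 4;
            filblks_t stolen = split_indlen(5, &l1, &l2, 2);

            assert(stolen == 2 && l1 == 3 && l2 == 4);
            return 0;
    }

The caller in xfs_bmap_del_extent() then shrinks del->br_blockcount by the stolen blocks, which is why the xfs_mod_fdblocks() call in xfs_bunmapi() is deferred until after the delete: only what was genuinely freed gets returned.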
@@ -4984,28 +5041,29 @@ xfs_bmap_del_extent(
4984 XFS_IFORK_NEXT_SET(ip, whichfork, 5041 XFS_IFORK_NEXT_SET(ip, whichfork,
4985 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 5042 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
4986 } else { 5043 } else {
5044 xfs_filblks_t stolen;
4987 ASSERT(whichfork == XFS_DATA_FORK); 5045 ASSERT(whichfork == XFS_DATA_FORK);
4988 temp = xfs_bmap_worst_indlen(ip, temp); 5046
5047 /*
5048 * Distribute the original indlen reservation across the
5049 * two new extents. Steal blocks from the deleted extent
5050 * if necessary. Stealing blocks simply fudges the
5051 * fdblocks accounting in xfs_bunmapi().
5052 */
5053 temp = xfs_bmap_worst_indlen(ip, got.br_blockcount);
5054 temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount);
5055 stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2,
5056 del->br_blockcount);
5057 da_new = temp + temp2 - stolen;
5058 del->br_blockcount -= stolen;
5059
5060 /*
5061 * Set the reservation for each extent. Warn if either
5062 * is zero as this can lead to delalloc problems.
5063 */
5064 WARN_ON_ONCE(!temp || !temp2);
4989 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 5065 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
4990 temp2 = xfs_bmap_worst_indlen(ip, temp2);
4991 new.br_startblock = nullstartblock((int)temp2); 5066 new.br_startblock = nullstartblock((int)temp2);
4992 da_new = temp + temp2;
4993 while (da_new > da_old) {
4994 if (temp) {
4995 temp--;
4996 da_new--;
4997 xfs_bmbt_set_startblock(ep,
4998 nullstartblock((int)temp));
4999 }
5000 if (da_new == da_old)
5001 break;
5002 if (temp2) {
5003 temp2--;
5004 da_new--;
5005 new.br_startblock =
5006 nullstartblock((int)temp2);
5007 }
5008 }
5009 } 5067 }
5010 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); 5068 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
5011 xfs_iext_insert(ip, *idx + 1, 1, &new, state); 5069 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
@@ -5210,7 +5268,7 @@ xfs_bunmapi(
5210 * This is better than zeroing it. 5268 * This is better than zeroing it.
5211 */ 5269 */
5212 ASSERT(del.br_state == XFS_EXT_NORM); 5270 ASSERT(del.br_state == XFS_EXT_NORM);
5213 ASSERT(xfs_trans_get_block_res(tp) > 0); 5271 ASSERT(tp->t_blk_res > 0);
5214 /* 5272 /*
5215 * If this spans a realtime extent boundary, 5273 * If this spans a realtime extent boundary,
5216 * chop it back to the start of the one we end at. 5274 * chop it back to the start of the one we end at.
@@ -5241,7 +5299,7 @@ xfs_bunmapi(
5241 del.br_startblock += mod; 5299 del.br_startblock += mod;
5242 } else if ((del.br_startoff == start && 5300 } else if ((del.br_startoff == start &&
5243 (del.br_state == XFS_EXT_UNWRITTEN || 5301 (del.br_state == XFS_EXT_UNWRITTEN ||
5244 xfs_trans_get_block_res(tp) == 0)) || 5302 tp->t_blk_res == 0)) ||
5245 !xfs_sb_version_hasextflgbit(&mp->m_sb)) { 5303 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
5246 /* 5304 /*
5247 * Can't make it unwritten. There isn't 5305 * Can't make it unwritten. There isn't
@@ -5296,9 +5354,37 @@ xfs_bunmapi(
5296 goto nodelete; 5354 goto nodelete;
5297 } 5355 }
5298 } 5356 }
5357
5358 /*
5359 * If it's the case where the directory code is running
5360 * with no block reservation, and the deleted block is in
5361 * the middle of its extent, and the resulting insert
5362 * of an extent would cause transformation to btree format,
5363 * then reject it. The calling code will then swap
5364 * blocks around instead.
5365 * We have to do this now, rather than waiting for the
5366 * conversion to btree format, since the transaction
5367 * will be dirty.
5368 */
5369 if (!wasdel && tp->t_blk_res == 0 &&
5370 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5371 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5372 XFS_IFORK_MAXEXT(ip, whichfork) &&
5373 del.br_startoff > got.br_startoff &&
5374 del.br_startoff + del.br_blockcount <
5375 got.br_startoff + got.br_blockcount) {
5376 error = -ENOSPC;
5377 goto error0;
5378 }
5379
5380 /*
5381 * Unreserve quota and update realtime free space, if
5382 * appropriate. If delayed allocation, update the inode delalloc
5383 * counter now and wait to update the sb counters as
5384 * xfs_bmap_del_extent() might need to borrow some blocks.
5385 */
5299 if (wasdel) { 5386 if (wasdel) {
5300 ASSERT(startblockval(del.br_startblock) > 0); 5387 ASSERT(startblockval(del.br_startblock) > 0);
5301 /* Update realtime/data freespace, unreserve quota */
5302 if (isrt) { 5388 if (isrt) {
5303 xfs_filblks_t rtexts; 5389 xfs_filblks_t rtexts;
5304 5390
@@ -5309,8 +5395,6 @@ xfs_bunmapi(
5309 ip, -((long)del.br_blockcount), 0, 5395 ip, -((long)del.br_blockcount), 0,
5310 XFS_QMOPT_RES_RTBLKS); 5396 XFS_QMOPT_RES_RTBLKS);
5311 } else { 5397 } else {
5312 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
5313 false);
5314 (void)xfs_trans_reserve_quota_nblks(NULL, 5398 (void)xfs_trans_reserve_quota_nblks(NULL,
5315 ip, -((long)del.br_blockcount), 0, 5399 ip, -((long)del.br_blockcount), 0,
5316 XFS_QMOPT_RES_REGBLKS); 5400 XFS_QMOPT_RES_REGBLKS);
@@ -5321,32 +5405,16 @@ xfs_bunmapi(
5321 XFS_BTCUR_BPRV_WASDEL; 5405 XFS_BTCUR_BPRV_WASDEL;
5322 } else if (cur) 5406 } else if (cur)
5323 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; 5407 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
5324 /* 5408
5325 * If it's the case where the directory code is running
5326 * with no block reservation, and the deleted block is in
5327 * the middle of its extent, and the resulting insert
5328 * of an extent would cause transformation to btree format,
5329 * then reject it. The calling code will then swap
5330 * blocks around instead.
5331 * We have to do this now, rather than waiting for the
5332 * conversion to btree format, since the transaction
5333 * will be dirty.
5334 */
5335 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5336 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5337 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5338 XFS_IFORK_MAXEXT(ip, whichfork) &&
5339 del.br_startoff > got.br_startoff &&
5340 del.br_startoff + del.br_blockcount <
5341 got.br_startoff + got.br_blockcount) {
5342 error = -ENOSPC;
5343 goto error0;
5344 }
5345 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, 5409 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5346 &tmp_logflags, whichfork); 5410 &tmp_logflags, whichfork);
5347 logflags |= tmp_logflags; 5411 logflags |= tmp_logflags;
5348 if (error) 5412 if (error)
5349 goto error0; 5413 goto error0;
5414
5415 if (!isrt && wasdel)
5416 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
5417
5350 bno = del.br_startoff - 1; 5418 bno = del.br_startoff - 1;
5351nodelete: 5419nodelete:
5352 /* 5420 /*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 1637c37bfbaa..6282f6e708af 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -461,7 +461,7 @@ xfs_bmbt_alloc_block(
461 * reservation amount is insufficient then we may fail a 461 * reservation amount is insufficient then we may fail a
462 * block allocation here and corrupt the filesystem. 462 * block allocation here and corrupt the filesystem.
463 */ 463 */
464 args.minleft = xfs_trans_get_block_res(args.tp); 464 args.minleft = args.tp->t_blk_res;
465 } else if (cur->bc_private.b.flist->xbf_low) { 465 } else if (cur->bc_private.b.flist->xbf_low) {
466 args.type = XFS_ALLOCTYPE_START_BNO; 466 args.type = XFS_ALLOCTYPE_START_BNO;
467 } else { 467 } else {
@@ -470,7 +470,7 @@ xfs_bmbt_alloc_block(
470 470
471 args.minlen = args.maxlen = args.prod = 1; 471 args.minlen = args.maxlen = args.prod = 1;
472 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; 472 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
473 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { 473 if (!args.wasdel && args.tp->t_blk_res == 0) {
474 error = -ENOSPC; 474 error = -ENOSPC;
475 goto error0; 475 goto error0;
476 } 476 }
@@ -531,7 +531,6 @@ xfs_bmbt_free_block(
531 531
532 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 532 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
533 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); 533 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
534 xfs_trans_binval(tp, bp);
535 return 0; 534 return 0;
536} 535}
537 536
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a0eb18ce3ad3..1f88e1ce770f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -294,6 +294,21 @@ xfs_btree_sblock_verify_crc(
294 return true; 294 return true;
295} 295}
296 296
297static int
298xfs_btree_free_block(
299 struct xfs_btree_cur *cur,
300 struct xfs_buf *bp)
301{
302 int error;
303
304 error = cur->bc_ops->free_block(cur, bp);
305 if (!error) {
306 xfs_trans_binval(cur->bc_tp, bp);
307 XFS_BTREE_STATS_INC(cur, free);
308 }
309 return error;
310}
311
297/* 312/*
298 * Delete the btree cursor. 313 * Delete the btree cursor.
299 */ 314 */
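The new xfs_btree_free_block() wrapper centralizes what every caller used to do by hand: invoke the per-btree ->free_block method and, only on success, invalidate the buffer and bump the free statistic. That is why the allocbt, bmbt, and inobt free_block implementations elsewhere in this diff drop their private xfs_trans_binval() calls. Callers shrink to this pattern (error handling as in xfs_btree_kill_root() below):

    error = xfs_btree_free_block(cur, bp);
    if (error)
            return error;   /* on success the buffer is already invalidated */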
@@ -3209,6 +3224,7 @@ xfs_btree_kill_iroot(
3209 int level; 3224 int level;
3210 int index; 3225 int index;
3211 int numrecs; 3226 int numrecs;
3227 int error;
3212#ifdef DEBUG 3228#ifdef DEBUG
3213 union xfs_btree_ptr ptr; 3229 union xfs_btree_ptr ptr;
3214 int i; 3230 int i;
@@ -3272,8 +3288,6 @@ xfs_btree_kill_iroot(
3272 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3288 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3273#ifdef DEBUG 3289#ifdef DEBUG
3274 for (i = 0; i < numrecs; i++) { 3290 for (i = 0; i < numrecs; i++) {
3275 int error;
3276
3277 error = xfs_btree_check_ptr(cur, cpp, i, level - 1); 3291 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3278 if (error) { 3292 if (error) {
3279 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3293 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
@@ -3283,8 +3297,11 @@ xfs_btree_kill_iroot(
3283#endif 3297#endif
3284 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3298 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3285 3299
3286 cur->bc_ops->free_block(cur, cbp); 3300 error = xfs_btree_free_block(cur, cbp);
3287 XFS_BTREE_STATS_INC(cur, free); 3301 if (error) {
3302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3303 return error;
3304 }
3288 3305
3289 cur->bc_bufs[level - 1] = NULL; 3306 cur->bc_bufs[level - 1] = NULL;
3290 be16_add_cpu(&block->bb_level, -1); 3307 be16_add_cpu(&block->bb_level, -1);
@@ -3317,14 +3334,12 @@ xfs_btree_kill_root(
3317 */ 3334 */
3318 cur->bc_ops->set_root(cur, newroot, -1); 3335 cur->bc_ops->set_root(cur, newroot, -1);
3319 3336
3320 error = cur->bc_ops->free_block(cur, bp); 3337 error = xfs_btree_free_block(cur, bp);
3321 if (error) { 3338 if (error) {
3322 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 3339 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3323 return error; 3340 return error;
3324 } 3341 }
3325 3342
3326 XFS_BTREE_STATS_INC(cur, free);
3327
3328 cur->bc_bufs[level] = NULL; 3343 cur->bc_bufs[level] = NULL;
3329 cur->bc_ra[level] = 0; 3344 cur->bc_ra[level] = 0;
3330 cur->bc_nlevels--; 3345 cur->bc_nlevels--;
@@ -3830,10 +3845,9 @@ xfs_btree_delrec(
3830 } 3845 }
3831 3846
3832 /* Free the deleted block. */ 3847 /* Free the deleted block. */
3833 error = cur->bc_ops->free_block(cur, rbp); 3848 error = xfs_btree_free_block(cur, rbp);
3834 if (error) 3849 if (error)
3835 goto error0; 3850 goto error0;
3836 XFS_BTREE_STATS_INC(cur, free);
3837 3851
3838 /* 3852 /*
3839 * If we joined with the left neighbor, set the buffer in the 3853 * If we joined with the left neighbor, set the buffer in the
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b14bbd6bb05f..8d4d8bce41bf 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -641,6 +641,22 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
641 */ 641 */
642#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ 642#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
643 643
644/*
645 * Entries are packed toward the top as tight as possible.
646 */
647typedef struct xfs_attr_shortform {
648 struct xfs_attr_sf_hdr { /* constant-structure header block */
649 __be16 totsize; /* total bytes in shortform list */
650 __u8 count; /* count of active entries */
651 } hdr;
652 struct xfs_attr_sf_entry {
653 __uint8_t namelen; /* actual length of name (no NULL) */
654 __uint8_t valuelen; /* actual length of value (no NULL) */
655 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
656 __uint8_t nameval[1]; /* name & value bytes concatenated */
657 } list[1]; /* variable sized array */
658} xfs_attr_shortform_t;
659
644typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ 660typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
645 __be16 base; /* base of free region */ 661 __be16 base; /* base of free region */
646 __be16 size; /* length of free region */ 662 __be16 size; /* length of free region */
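The xfs_attr_shortform definition moves verbatim from xfs_attr_sf.h (removed above) into xfs_da_format.h, keeping the on-disk directory/attribute format structures in one header; the typedefs left behind in xfs_attr_sf.h keep existing users compiling. For reference, one packed entry occupies a fixed 3-byte header plus the concatenated name and value bytes; an illustrative sizing helper (not part of the patch):

    /* Illustrative: bytes used by one shortform entry in the packed list. */
    static inline int attr_sf_entsize(int namelen, int valuelen)
    {
            return 3 + namelen + valuelen;  /* namelen, valuelen, flags + data */
    }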
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 2fb53a5c0a74..af0f9d171f8a 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -176,7 +176,7 @@ xfs_dir_isempty(
176{ 176{
177 xfs_dir2_sf_hdr_t *sfp; 177 xfs_dir2_sf_hdr_t *sfp;
178 178
179 ASSERT(S_ISDIR(dp->i_d.di_mode)); 179 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
180 if (dp->i_d.di_size == 0) /* might happen during shutdown. */ 180 if (dp->i_d.di_size == 0) /* might happen during shutdown. */
181 return 1; 181 return 1;
182 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) 182 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
@@ -231,7 +231,7 @@ xfs_dir_init(
231 struct xfs_da_args *args; 231 struct xfs_da_args *args;
232 int error; 232 int error;
233 233
234 ASSERT(S_ISDIR(dp->i_d.di_mode)); 234 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
235 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); 235 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
236 if (error) 236 if (error)
237 return error; 237 return error;
@@ -266,7 +266,7 @@ xfs_dir_createname(
266 int rval; 266 int rval;
267 int v; /* type-checking value */ 267 int v; /* type-checking value */
268 268
269 ASSERT(S_ISDIR(dp->i_d.di_mode)); 269 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
270 if (inum) { 270 if (inum) {
271 rval = xfs_dir_ino_validate(tp->t_mountp, inum); 271 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
272 if (rval) 272 if (rval)
@@ -364,7 +364,7 @@ xfs_dir_lookup(
364 int v; /* type-checking value */ 364 int v; /* type-checking value */
365 int lock_mode; 365 int lock_mode;
366 366
367 ASSERT(S_ISDIR(dp->i_d.di_mode)); 367 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
368 XFS_STATS_INC(dp->i_mount, xs_dir_lookup); 368 XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
369 369
370 /* 370 /*
@@ -443,7 +443,7 @@ xfs_dir_removename(
443 int rval; 443 int rval;
444 int v; /* type-checking value */ 444 int v; /* type-checking value */
445 445
446 ASSERT(S_ISDIR(dp->i_d.di_mode)); 446 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
447 XFS_STATS_INC(dp->i_mount, xs_dir_remove); 447 XFS_STATS_INC(dp->i_mount, xs_dir_remove);
448 448
449 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 449 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -505,7 +505,7 @@ xfs_dir_replace(
505 int rval; 505 int rval;
506 int v; /* type-checking value */ 506 int v; /* type-checking value */
507 507
508 ASSERT(S_ISDIR(dp->i_d.di_mode)); 508 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
509 509
510 rval = xfs_dir_ino_validate(tp->t_mountp, inum); 510 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
511 if (rval) 511 if (rval)
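These hunks are part of retiring the duplicated mode field in the XFS-private icdinode in favor of the VFS inode's i_mode (see the xfs_inode_buf.c changes below). The conversion is mechanical because VFS_I() already returns the struct inode embedded in struct xfs_inode; its definition, from fs/xfs/xfs_inode.h, is simply:

    static inline struct inode *VFS_I(struct xfs_inode *ip)
    {
            return &ip->i_vnode;
    }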
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 63ee03db796c..75a557432d0f 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -2235,6 +2235,9 @@ xfs_dir2_node_trim_free(
2235 2235
2236 dp = args->dp; 2236 dp = args->dp;
2237 tp = args->trans; 2237 tp = args->trans;
2238
2239 *rvalp = 0;
2240
2238 /* 2241 /*
2239 * Read the freespace block. 2242 * Read the freespace block.
2240 */ 2243 */
@@ -2255,7 +2258,6 @@ xfs_dir2_node_trim_free(
2255 */ 2258 */
2256 if (freehdr.nused > 0) { 2259 if (freehdr.nused > 0) {
2257 xfs_trans_brelse(tp, bp); 2260 xfs_trans_brelse(tp, bp);
2258 *rvalp = 0;
2259 return 0; 2261 return 0;
2260 } 2262 }
2261 /* 2263 /*
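Moving *rvalp = 0 to the top of xfs_dir2_node_trim_free() guarantees the out-parameter is defined on every path, not just the freehdr.nused > 0 one. The general pattern, sketched in isolation:

    /* Sketch: out-parameters get a defined value before any early return. */
    static int trim_free(int nused, int *rvalp)
    {
            *rvalp = 0;             /* defined on every path */
            if (nused > 0)
                    return 0;       /* early return: *rvalp already valid */
            *rvalp = 1;             /* the one path that reports a trim */
            return 0;
    }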
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 66d702e6b9ff..22297f9b0fd5 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2403,8 +2403,8 @@ xfs_ialloc_compute_maxlevels(
2403 2403
2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> 2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
2405 XFS_INODES_PER_CHUNK_LOG; 2405 XFS_INODES_PER_CHUNK_LOG;
2406 minleafrecs = mp->m_alloc_mnr[0]; 2406 minleafrecs = mp->m_inobt_mnr[0];
2407 minnoderecs = mp->m_alloc_mnr[1]; 2407 minnoderecs = mp->m_inobt_mnr[1];
2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; 2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
2409 for (level = 1; maxblocks > 1; level++) 2409 for (level = 1; maxblocks > 1; level++)
2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; 2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
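The one-line fix above matters because xfs_ialloc_compute_maxlevels() sizes the worst-case height of the inode btrees but was using the free-space btree's minimum record counts (m_alloc_mnr) rather than the inobt's (m_inobt_mnr); with the wrong fan-out the computed maximum can be off. The computation itself is a repeated ceiling division, sketched with made-up geometry:

    /* Sketch: tree height needed to index maxleafents records when every
     * block holds at least minrecs entries (geometry is illustrative). */
    static int compute_maxlevels(unsigned long long maxleafents, int minrecs)
    {
            unsigned long long maxblocks =
                    (maxleafents + minrecs - 1) / minrecs;
            int level;

            for (level = 1; maxblocks > 1; level++)
                    maxblocks = (maxblocks + minrecs - 1) / minrecs;
            return level;   /* e.g. 1,000,000 records at minrecs 16 -> 5 */
    }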
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index c679f3c05b63..89c21d771e35 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -125,16 +125,8 @@ xfs_inobt_free_block(
125 struct xfs_btree_cur *cur, 125 struct xfs_btree_cur *cur,
126 struct xfs_buf *bp) 126 struct xfs_buf *bp)
127{ 127{
128 xfs_fsblock_t fsbno; 128 return xfs_free_extent(cur->bc_tp,
129 int error; 129 XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
130
131 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
132 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
133 if (error)
134 return error;
135
136 xfs_trans_binval(cur->bc_tp, bp);
137 return error;
138} 130}
139 131
140STATIC int 132STATIC int
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 1aabfda669b0..9d9559eb2835 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -195,28 +195,50 @@ xfs_imap_to_bp(
195} 195}
196 196
197void 197void
198xfs_dinode_from_disk( 198xfs_inode_from_disk(
199 xfs_icdinode_t *to, 199 struct xfs_inode *ip,
200 xfs_dinode_t *from) 200 struct xfs_dinode *from)
201{ 201{
202 to->di_magic = be16_to_cpu(from->di_magic); 202 struct xfs_icdinode *to = &ip->i_d;
203 to->di_mode = be16_to_cpu(from->di_mode); 203 struct inode *inode = VFS_I(ip);
204 to->di_version = from ->di_version; 204
205
206 /*
207 * Convert v1 inodes immediately to v2 inode format as this is the
208 * minimum inode version format we support in the rest of the code.
209 */
210 to->di_version = from->di_version;
211 if (to->di_version == 1) {
212 set_nlink(inode, be16_to_cpu(from->di_onlink));
213 to->di_projid_lo = 0;
214 to->di_projid_hi = 0;
215 to->di_version = 2;
216 } else {
217 set_nlink(inode, be32_to_cpu(from->di_nlink));
218 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
219 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
220 }
221
205 to->di_format = from->di_format; 222 to->di_format = from->di_format;
206 to->di_onlink = be16_to_cpu(from->di_onlink);
207 to->di_uid = be32_to_cpu(from->di_uid); 223 to->di_uid = be32_to_cpu(from->di_uid);
208 to->di_gid = be32_to_cpu(from->di_gid); 224 to->di_gid = be32_to_cpu(from->di_gid);
209 to->di_nlink = be32_to_cpu(from->di_nlink);
210 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
211 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
212 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
213 to->di_flushiter = be16_to_cpu(from->di_flushiter); 225 to->di_flushiter = be16_to_cpu(from->di_flushiter);
214 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 226
215 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); 227 /*
216 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); 228 * Time is signed, so need to convert to signed 32 bit before
217 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); 229 * storing in inode timestamp which may be 64 bit. Otherwise
218 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); 230 * a time before epoch is converted to a time long after epoch
219 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); 231 * on 64 bit systems.
232 */
233 inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
234 inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
235 inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
236 inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
237 inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
238 inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
239 inode->i_generation = be32_to_cpu(from->di_gen);
240 inode->i_mode = be16_to_cpu(from->di_mode);
241
220 to->di_size = be64_to_cpu(from->di_size); 242 to->di_size = be64_to_cpu(from->di_size);
221 to->di_nblocks = be64_to_cpu(from->di_nblocks); 243 to->di_nblocks = be64_to_cpu(from->di_nblocks);
222 to->di_extsize = be32_to_cpu(from->di_extsize); 244 to->di_extsize = be32_to_cpu(from->di_extsize);
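The (int) casts in the timestamp conversions above are load-bearing: the on-disk fields are signed 32-bit, while tv_sec may be 64-bit in core, so the value must be sign-extended rather than zero-extended or a pre-1970 time turns into one far in the future. A hypothetical illustration:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t on_disk = 0xfffffffeU;         /* two seconds before the epoch */
            int64_t zero_ext = on_disk;             /* 4294967294: far future */
            int64_t sign_ext = (int32_t)on_disk;    /* -2: pre-1970, as intended */

            assert(zero_ext != sign_ext && sign_ext == -2);
            return 0;
    }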
@@ -227,42 +249,96 @@ xfs_dinode_from_disk(
227 to->di_dmevmask = be32_to_cpu(from->di_dmevmask); 249 to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
228 to->di_dmstate = be16_to_cpu(from->di_dmstate); 250 to->di_dmstate = be16_to_cpu(from->di_dmstate);
229 to->di_flags = be16_to_cpu(from->di_flags); 251 to->di_flags = be16_to_cpu(from->di_flags);
230 to->di_gen = be32_to_cpu(from->di_gen);
231 252
232 if (to->di_version == 3) { 253 if (to->di_version == 3) {
233 to->di_changecount = be64_to_cpu(from->di_changecount); 254 inode->i_version = be64_to_cpu(from->di_changecount);
234 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); 255 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
235 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); 256 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
236 to->di_flags2 = be64_to_cpu(from->di_flags2); 257 to->di_flags2 = be64_to_cpu(from->di_flags2);
237 to->di_ino = be64_to_cpu(from->di_ino);
238 to->di_lsn = be64_to_cpu(from->di_lsn);
239 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
240 uuid_copy(&to->di_uuid, &from->di_uuid);
241 } 258 }
242} 259}
243 260
244void 261void
245xfs_dinode_to_disk( 262xfs_inode_to_disk(
246 xfs_dinode_t *to, 263 struct xfs_inode *ip,
247 xfs_icdinode_t *from) 264 struct xfs_dinode *to,
265 xfs_lsn_t lsn)
266{
267 struct xfs_icdinode *from = &ip->i_d;
268 struct inode *inode = VFS_I(ip);
269
270 to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
271 to->di_onlink = 0;
272
273 to->di_version = from->di_version;
274 to->di_format = from->di_format;
275 to->di_uid = cpu_to_be32(from->di_uid);
276 to->di_gid = cpu_to_be32(from->di_gid);
277 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
278 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
279
280 memset(to->di_pad, 0, sizeof(to->di_pad));
281 to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
282 to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
283 to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
284 to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
285 to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
286 to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
287 to->di_nlink = cpu_to_be32(inode->i_nlink);
288 to->di_gen = cpu_to_be32(inode->i_generation);
289 to->di_mode = cpu_to_be16(inode->i_mode);
290
291 to->di_size = cpu_to_be64(from->di_size);
292 to->di_nblocks = cpu_to_be64(from->di_nblocks);
293 to->di_extsize = cpu_to_be32(from->di_extsize);
294 to->di_nextents = cpu_to_be32(from->di_nextents);
295 to->di_anextents = cpu_to_be16(from->di_anextents);
296 to->di_forkoff = from->di_forkoff;
297 to->di_aformat = from->di_aformat;
298 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
299 to->di_dmstate = cpu_to_be16(from->di_dmstate);
300 to->di_flags = cpu_to_be16(from->di_flags);
301
302 if (from->di_version == 3) {
303 to->di_changecount = cpu_to_be64(inode->i_version);
304 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
305 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
306 to->di_flags2 = cpu_to_be64(from->di_flags2);
307
308 to->di_ino = cpu_to_be64(ip->i_ino);
309 to->di_lsn = cpu_to_be64(lsn);
310 memset(to->di_pad2, 0, sizeof(to->di_pad2));
311 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
312 to->di_flushiter = 0;
313 } else {
314 to->di_flushiter = cpu_to_be16(from->di_flushiter);
315 }
316}
317
318void
319xfs_log_dinode_to_disk(
320 struct xfs_log_dinode *from,
321 struct xfs_dinode *to)
248{ 322{
249 to->di_magic = cpu_to_be16(from->di_magic); 323 to->di_magic = cpu_to_be16(from->di_magic);
250 to->di_mode = cpu_to_be16(from->di_mode); 324 to->di_mode = cpu_to_be16(from->di_mode);
251 to->di_version = from ->di_version; 325 to->di_version = from->di_version;
252 to->di_format = from->di_format; 326 to->di_format = from->di_format;
253 to->di_onlink = cpu_to_be16(from->di_onlink); 327 to->di_onlink = 0;
254 to->di_uid = cpu_to_be32(from->di_uid); 328 to->di_uid = cpu_to_be32(from->di_uid);
255 to->di_gid = cpu_to_be32(from->di_gid); 329 to->di_gid = cpu_to_be32(from->di_gid);
256 to->di_nlink = cpu_to_be32(from->di_nlink); 330 to->di_nlink = cpu_to_be32(from->di_nlink);
257 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 331 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
258 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 332 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
259 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 333 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
334
260 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 335 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
261 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 336 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
262 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 337 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
263 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); 338 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
264 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); 339 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
265 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); 340 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
341
266 to->di_size = cpu_to_be64(from->di_size); 342 to->di_size = cpu_to_be64(from->di_size);
267 to->di_nblocks = cpu_to_be64(from->di_nblocks); 343 to->di_nblocks = cpu_to_be64(from->di_nblocks);
268 to->di_extsize = cpu_to_be32(from->di_extsize); 344 to->di_extsize = cpu_to_be32(from->di_extsize);
@@ -367,13 +443,10 @@ xfs_iread(
367 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 443 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
368 /* initialise the on-disk inode core */ 444 /* initialise the on-disk inode core */
369 memset(&ip->i_d, 0, sizeof(ip->i_d)); 445 memset(&ip->i_d, 0, sizeof(ip->i_d));
370 ip->i_d.di_magic = XFS_DINODE_MAGIC; 446 VFS_I(ip)->i_generation = prandom_u32();
371 ip->i_d.di_gen = prandom_u32(); 447 if (xfs_sb_version_hascrc(&mp->m_sb))
372 if (xfs_sb_version_hascrc(&mp->m_sb)) {
373 ip->i_d.di_version = 3; 448 ip->i_d.di_version = 3;
374 ip->i_d.di_ino = ip->i_ino; 449 else
375 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
376 } else
377 ip->i_d.di_version = 2; 450 ip->i_d.di_version = 2;
378 return 0; 451 return 0;
379 } 452 }
@@ -403,7 +476,7 @@ xfs_iread(
403 * Otherwise, just get the truly permanent information. 476 * Otherwise, just get the truly permanent information.
404 */ 477 */
405 if (dip->di_mode) { 478 if (dip->di_mode) {
406 xfs_dinode_from_disk(&ip->i_d, dip); 479 xfs_inode_from_disk(ip, dip);
407 error = xfs_iformat_fork(ip, dip); 480 error = xfs_iformat_fork(ip, dip);
408 if (error) { 481 if (error) {
409#ifdef DEBUG 482#ifdef DEBUG
@@ -417,16 +490,10 @@ xfs_iread(
417 * Partial initialisation of the in-core inode. Just the bits 490 * Partial initialisation of the in-core inode. Just the bits
418 * that xfs_ialloc won't overwrite or relies on being correct. 491 * that xfs_ialloc won't overwrite or relies on being correct.
419 */ 492 */
420 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
421 ip->i_d.di_version = dip->di_version; 493 ip->i_d.di_version = dip->di_version;
422 ip->i_d.di_gen = be32_to_cpu(dip->di_gen); 494 VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
423 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); 495 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
424 496
425 if (dip->di_version == 3) {
426 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
427 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
428 }
429
430 /* 497 /*
431 * Make sure to pull in the mode here as well in 498 * Make sure to pull in the mode here as well in
432 * case the inode is released without being used. 499 * case the inode is released without being used.
@@ -434,25 +501,10 @@ xfs_iread(
434 * the inode is already free and not try to mess 501 * the inode is already free and not try to mess
435 * with the uninitialized part of it. 502 * with the uninitialized part of it.
436 */ 503 */
437 ip->i_d.di_mode = 0; 504 VFS_I(ip)->i_mode = 0;
438 }
439
440 /*
441 * Automatically convert version 1 inode formats in memory to version 2
442 * inode format. If the inode is modified, it will get logged and
443 * rewritten as a version 2 inode. We can do this because we set the
444 * superblock feature bit for v2 inodes unconditionally during mount
445 * and it means the rest of the code can assume the inode version is 2
446 * or higher.
447 */
448 if (ip->i_d.di_version == 1) {
449 ip->i_d.di_version = 2;
450 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
451 ip->i_d.di_nlink = ip->i_d.di_onlink;
452 ip->i_d.di_onlink = 0;
453 xfs_set_projid(ip, 0);
454 } 505 }
455 506
507 ASSERT(ip->i_d.di_version >= 2);
456 ip->i_delayed_blks = 0; 508 ip->i_delayed_blks = 0;
457 509
458 /* 510 /*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..7c4dd321b215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -20,7 +20,36 @@
20 20
21struct xfs_inode; 21struct xfs_inode;
22struct xfs_dinode; 22struct xfs_dinode;
23struct xfs_icdinode; 23
24/*
25 * In-memory representation of the XFS inode. This is held in the in-core struct
26 * xfs_inode and represents the current on-disk values but the structure is not
27 * in on-disk format. That is, this structure is always translated to on-disk
28 * format specific structures at the appropriate time.
29 */
30struct xfs_icdinode {
31 __int8_t di_version; /* inode version */
32 __int8_t di_format; /* format of di_c data */
33 __uint16_t di_flushiter; /* incremented on flush */
34 __uint32_t di_uid; /* owner's user id */
35 __uint32_t di_gid; /* owner's group id */
36 __uint16_t di_projid_lo; /* lower part of owner's project id */
37 __uint16_t di_projid_hi; /* higher part of owner's project id */
38 xfs_fsize_t di_size; /* number of bytes in file */
39 xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
40 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
41 xfs_extnum_t di_nextents; /* number of extents in data fork */
42 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
43 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
44 __int8_t di_aformat; /* format of attr fork's data */
45 __uint32_t di_dmevmask; /* DMIG event mask */
46 __uint16_t di_dmstate; /* DMIG state info */
47 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
48
49 __uint64_t di_flags2; /* more random flags */
50
51 xfs_ictimestamp_t di_crtime; /* time created */
52};
24 53
25/* 54/*
26 * Inode location information. Stored in the inode and passed to 55 * Inode location information. Stored in the inode and passed to
@@ -38,8 +67,11 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
38int xfs_iread(struct xfs_mount *, struct xfs_trans *, 67int xfs_iread(struct xfs_mount *, struct xfs_trans *,
39 struct xfs_inode *, uint); 68 struct xfs_inode *, uint);
40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); 69void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
41void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); 70void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
42void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); 71 xfs_lsn_t lsn);
72void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
73void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
74 struct xfs_dinode *to);
43 75
44#if defined(DEBUG) 76#if defined(DEBUG)
45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); 77void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
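
The header change above completes the split: struct xfs_icdinode is now a purely in-core, native-endian structure, and xfs_inode_to_disk()/xfs_inode_from_disk() translate it to and from the big-endian on-disk dinode only at the boundary. A self-contained sketch of that translate-at-the-boundary pattern, with invented names and a hand-rolled byte swap (assuming a little-endian host; the kernel uses cpu_to_be32() and friends):

#include <stdint.h>
#include <stdio.h>

struct icore {		/* in-core: native endianness */
	uint32_t uid;
	uint64_t size;
};

struct odisk {		/* on-disk: fixed big-endian layout */
	uint32_t uid;
	uint64_t size;
};

static uint32_t swap32(uint32_t x)
{
	return ((x & 0xffu) << 24) | ((x & 0xff00u) << 8) |
	       ((x >> 8) & 0xff00u) | (x >> 24);
}

static uint64_t swap64(uint64_t x)
{
	return ((uint64_t)swap32((uint32_t)x) << 32) | swap32((uint32_t)(x >> 32));
}

/* Translate only at the flush boundary, as xfs_inode_to_disk() does. */
static void icore_to_disk(const struct icore *from, struct odisk *to)
{
	to->uid = swap32(from->uid);
	to->size = swap64(from->size);
}

int main(void)
{
	struct icore ic = { .uid = 1000, .size = 4096 };
	struct odisk od;
	const unsigned char *p = (const unsigned char *)&od.uid;

	icore_to_disk(&ic, &od);
	/* 1000 == 0x3e8, so the on-disk bytes come out as 00 00 03 e8 */
	printf("on-disk uid bytes: %02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
	return 0;
}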
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0defbd02f62d..11faf7df14c8 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -31,6 +31,7 @@
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_trace.h" 32#include "xfs_trace.h"
33#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
34#include "xfs_da_format.h"
34 35
35kmem_zone_t *xfs_ifork_zone; 36kmem_zone_t *xfs_ifork_zone;
36 37
@@ -120,7 +121,7 @@ xfs_iformat_fork(
120 return -EFSCORRUPTED; 121 return -EFSCORRUPTED;
121 } 122 }
122 123
123 switch (ip->i_d.di_mode & S_IFMT) { 124 switch (VFS_I(ip)->i_mode & S_IFMT) {
124 case S_IFIFO: 125 case S_IFIFO:
125 case S_IFCHR: 126 case S_IFCHR:
126 case S_IFBLK: 127 case S_IFBLK:
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 265314690415..d54a8018b079 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -290,6 +290,7 @@ typedef struct xfs_inode_log_format_64 {
290 __int32_t ilf_boffset; /* off of inode in buffer */ 290 __int32_t ilf_boffset; /* off of inode in buffer */
291} xfs_inode_log_format_64_t; 291} xfs_inode_log_format_64_t;
292 292
293
293/* 294/*
294 * Flags for xfs_trans_log_inode flags field. 295 * Flags for xfs_trans_log_inode flags field.
295 */ 296 */
@@ -360,15 +361,15 @@ typedef struct xfs_ictimestamp {
360} xfs_ictimestamp_t; 361} xfs_ictimestamp_t;
361 362
362/* 363/*
363 * NOTE: This structure must be kept identical to struct xfs_dinode 364 * Define the format of the inode core that is logged. This structure must be
364 * except for the endianness annotations. 365 * kept identical to struct xfs_dinode except for the endianness annotations.
365 */ 366 */
366typedef struct xfs_icdinode { 367struct xfs_log_dinode {
367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 368 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
368 __uint16_t di_mode; /* mode and type of file */ 369 __uint16_t di_mode; /* mode and type of file */
369 __int8_t di_version; /* inode version */ 370 __int8_t di_version; /* inode version */
370 __int8_t di_format; /* format of di_c data */ 371 __int8_t di_format; /* format of di_c data */
371 __uint16_t di_onlink; /* old number of links to file */ 372 __uint8_t di_pad3[2]; /* unused in v2/3 inodes */
372 __uint32_t di_uid; /* owner's user id */ 373 __uint32_t di_uid; /* owner's user id */
373 __uint32_t di_gid; /* owner's group id */ 374 __uint32_t di_gid; /* owner's group id */
374 __uint32_t di_nlink; /* number of links to file */ 375 __uint32_t di_nlink; /* number of links to file */
@@ -407,13 +408,13 @@ typedef struct xfs_icdinode {
407 uuid_t di_uuid; /* UUID of the filesystem */ 408 uuid_t di_uuid; /* UUID of the filesystem */
408 409
409 /* structure must be padded to 64 bit alignment */ 410 /* structure must be padded to 64 bit alignment */
410} xfs_icdinode_t; 411};
411 412
412static inline uint xfs_icdinode_size(int version) 413static inline uint xfs_log_dinode_size(int version)
413{ 414{
414 if (version == 3) 415 if (version == 3)
415 return sizeof(struct xfs_icdinode); 416 return sizeof(struct xfs_log_dinode);
416 return offsetof(struct xfs_icdinode, di_next_unlinked); 417 return offsetof(struct xfs_log_dinode, di_next_unlinked);
417} 418}
418 419
419/* 420/*
@@ -495,6 +496,8 @@ enum xfs_blft {
495 XFS_BLFT_ATTR_LEAF_BUF, 496 XFS_BLFT_ATTR_LEAF_BUF,
496 XFS_BLFT_ATTR_RMT_BUF, 497 XFS_BLFT_ATTR_RMT_BUF,
497 XFS_BLFT_SB_BUF, 498 XFS_BLFT_SB_BUF,
499 XFS_BLFT_RTBITMAP_BUF,
500 XFS_BLFT_RTSUMMARY_BUF,
498 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), 501 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
499}; 502};
500 503
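
The xfs_log_dinode_size() helper above leans on a layout property: every v3-only field sits after di_next_unlinked, so a v2 inode can log just the prefix of the structure via offsetof(). A compilable miniature of the same trick, with the field set abbreviated and names illustrative:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct log_dinode_sketch {
	uint16_t di_magic;
	uint16_t di_mode;
	int8_t   di_version;
	/* ... remaining v2 core fields elided ... */
	uint32_t di_next_unlinked;	/* first field a v2 inode does not log */
	uint32_t di_crc;		/* v3-only tail: CRC, change count, LSN, ... */
	uint64_t di_lsn;
};

static size_t log_dinode_size(int version)
{
	if (version == 3)
		return sizeof(struct log_dinode_sketch);
	return offsetof(struct log_dinode_sketch, di_next_unlinked);
}

int main(void)
{
	printf("v2 logs %zu bytes, v3 logs %zu bytes\n",
	       log_dinode_size(2), log_dinode_size(3));
	return 0;
}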
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index f51078f1e92a..8eed51275bb3 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -37,7 +37,7 @@ typedef __uint16_t xfs_qwarncnt_t;
37#define XFS_DQ_PROJ 0x0002 /* project quota */ 37#define XFS_DQ_PROJ 0x0002 /* project quota */
38#define XFS_DQ_GROUP 0x0004 /* a group quota */ 38#define XFS_DQ_GROUP 0x0004 /* a group quota */
39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ 39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
40#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ 40#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
41 41
42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
43 43
@@ -116,6 +116,7 @@ typedef __uint16_t xfs_qwarncnt_t;
116#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 116#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
117#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 117#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
118#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ 118#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
119#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */
119 120
120/* 121/*
121 * flags to xfs_trans_mod_dquot to indicate which field needs to be 122 * flags to xfs_trans_mod_dquot to indicate which field needs to be
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 9b59ffa1fc19..951c044e24e4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -42,6 +42,31 @@
42 */ 42 */
43 43
44/* 44/*
45 * Real time buffers need verifiers to avoid runtime warnings during IO.
46 * We don't have anything to verify, however, so these are just dummy
47 * operations.
48 */
49static void
50xfs_rtbuf_verify_read(
51 struct xfs_buf *bp)
52{
53 return;
54}
55
56static void
57xfs_rtbuf_verify_write(
58 struct xfs_buf *bp)
59{
60 return;
61}
62
63const struct xfs_buf_ops xfs_rtbuf_ops = {
64 .name = "rtbuf",
65 .verify_read = xfs_rtbuf_verify_read,
66 .verify_write = xfs_rtbuf_verify_write,
67};
68
69/*
45 * Get a buffer for the bitmap or summary file block specified. 70 * Get a buffer for the bitmap or summary file block specified.
46 * The buffer is returned read and locked. 71 * The buffer is returned read and locked.
47 */ 72 */
@@ -68,9 +93,12 @@ xfs_rtbuf_get(
68 ASSERT(map.br_startblock != NULLFSBLOCK); 93 ASSERT(map.br_startblock != NULLFSBLOCK);
69 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 94 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
70 XFS_FSB_TO_DADDR(mp, map.br_startblock), 95 XFS_FSB_TO_DADDR(mp, map.br_startblock),
71 mp->m_bsize, 0, &bp, NULL); 96 mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
72 if (error) 97 if (error)
73 return error; 98 return error;
99
100 xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
101 : XFS_BLFT_RTBITMAP_BUF);
74 *bpp = bp; 102 *bpp = bp;
75 return 0; 103 return 0;
76} 104}
@@ -983,7 +1011,7 @@ xfs_rtfree_extent(
983 mp->m_sb.sb_rextents) { 1011 mp->m_sb.sb_rextents) {
984 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 1012 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
985 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 1013 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
986 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; 1014 *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
987 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); 1015 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
988 } 1016 }
989 return 0; 1017 return 0;
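
The rtbitmap hunk above wires the new (deliberately empty) verifiers into the buffer cache through an ops table, then tags each buffer so log recovery can tell bitmap blocks from summary blocks. The shape of that ops-table pattern, reduced to standalone C with stand-in types:

#include <stdio.h>

struct buf {
	const char *what;
};

struct buf_ops {
	const char *name;
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};

/* Nothing to check in rt bitmap/summary blocks, so the hooks are no-ops. */
static void rtbuf_verify_read(struct buf *bp)  { (void)bp; }
static void rtbuf_verify_write(struct buf *bp) { (void)bp; }

static const struct buf_ops rtbuf_ops = {
	.name = "rtbuf",
	.verify_read = rtbuf_verify_read,
	.verify_write = rtbuf_verify_write,
};

int main(void)
{
	struct buf bp = { .what = "rt bitmap block" };

	/* The I/O path always calls through the attached ops table. */
	rtbuf_ops.verify_read(&bp);
	rtbuf_ops.verify_write(&bp);
	printf("%s verified via '%s' ops\n", bp.what, rtbuf_ops.name);
	return 0;
}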
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index b25bb9a343f3..961e6475a309 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,7 +27,6 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
27extern void xfs_perag_put(struct xfs_perag *pag); 27extern void xfs_perag_put(struct xfs_perag *pag);
28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); 28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
29 29
30extern void xfs_sb_calc_crc(struct xfs_buf *bp);
31extern void xfs_log_sb(struct xfs_trans *tp); 30extern void xfs_log_sb(struct xfs_trans *tp);
32extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); 31extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
33extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); 32extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 15c3ceb845b9..81ac870834da 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -53,6 +53,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
53extern const struct xfs_buf_ops xfs_sb_buf_ops; 53extern const struct xfs_buf_ops xfs_sb_buf_ops;
54extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; 54extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
55extern const struct xfs_buf_ops xfs_symlink_buf_ops; 55extern const struct xfs_buf_ops xfs_symlink_buf_ops;
56extern const struct xfs_buf_ops xfs_rtbuf_ops;
56 57
57/* 58/*
58 * Transaction types. Used to distinguish types of buffers. These never reach 59 * Transaction types. Used to distinguish types of buffers. These never reach
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a9ebabfe7587..d445a64b979e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/writeback.h> 37#include <linux/writeback.h>
38 38
39/* flags for direct write completions */
40#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
41#define XFS_DIO_FLAG_APPEND (1 << 1)
42
43/*
44 * structure owned by writepages passed to individual writepage calls
45 */
46struct xfs_writepage_ctx {
47 struct xfs_bmbt_irec imap;
48 bool imap_valid;
49 unsigned int io_type;
50 struct xfs_ioend *ioend;
51 sector_t last_block;
52};
53
39void 54void
40xfs_count_page_state( 55xfs_count_page_state(
41 struct page *page, 56 struct page *page,
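
The XFS_DIO_FLAG_* values defined above are stored directly in the pointer-sized b_private field of the buffer head, which lets direct writes avoid allocating an ioend at all. A minimal sketch of packing flags into an opaque pointer slot; the struct name here is invented:

#include <assert.h>
#include <stdint.h>

#define DIO_FLAG_UNWRITTEN	(1u << 0)
#define DIO_FLAG_APPEND		(1u << 1)

struct bh_sketch {
	void *b_private;	/* pointer-sized scratch space */
};

/* Treat the private pointer as a small bit field, as xfs_map_direct() does. */
static void set_dio_flag(struct bh_sketch *bh, uintptr_t flag)
{
	uintptr_t *flags = (uintptr_t *)&bh->b_private;

	*flags |= flag;
}

int main(void)
{
	struct bh_sketch bh = { .b_private = 0 };

	set_dio_flag(&bh, DIO_FLAG_UNWRITTEN);

	/* Completion later recovers the bits from the opaque value. */
	uintptr_t flags = (uintptr_t)bh.b_private;
	assert(flags & DIO_FLAG_UNWRITTEN);
	assert(!(flags & DIO_FLAG_APPEND));
	return 0;
}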
@@ -214,10 +229,12 @@ xfs_end_io(
214 struct xfs_inode *ip = XFS_I(ioend->io_inode); 229 struct xfs_inode *ip = XFS_I(ioend->io_inode);
215 int error = 0; 230 int error = 0;
216 231
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 232 /*
233 * Set an error if the mount has shut down and proceed with end I/O
234 * processing so it can perform whatever cleanups are necessary.
235 */
236 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 ioend->io_error = -EIO; 237 ioend->io_error = -EIO;
219 goto done;
220 }
221 238
222 /* 239 /*
223 * For unwritten extents we need to issue transactions to convert a 240 * For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
265 */ 282 */
266 atomic_set(&ioend->io_remaining, 1); 283 atomic_set(&ioend->io_remaining, 1);
267 ioend->io_error = 0; 284 ioend->io_error = 0;
268 ioend->io_list = NULL; 285 INIT_LIST_HEAD(&ioend->io_list);
269 ioend->io_type = type; 286 ioend->io_type = type;
270 ioend->io_inode = inode; 287 ioend->io_inode = inode;
271 ioend->io_buffer_head = NULL; 288 ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
283 struct inode *inode, 300 struct inode *inode,
284 loff_t offset, 301 loff_t offset,
285 struct xfs_bmbt_irec *imap, 302 struct xfs_bmbt_irec *imap,
286 int type, 303 int type)
287 int nonblocking)
288{ 304{
289 struct xfs_inode *ip = XFS_I(inode); 305 struct xfs_inode *ip = XFS_I(inode);
290 struct xfs_mount *mp = ip->i_mount; 306 struct xfs_mount *mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
300 if (type == XFS_IO_UNWRITTEN) 316 if (type == XFS_IO_UNWRITTEN)
301 bmapi_flags |= XFS_BMAPI_IGSTATE; 317 bmapi_flags |= XFS_BMAPI_IGSTATE;
302 318
303 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 319 xfs_ilock(ip, XFS_ILOCK_SHARED);
304 if (nonblocking)
305 return -EAGAIN;
306 xfs_ilock(ip, XFS_ILOCK_SHARED);
307 }
308
309 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 320 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
310 (ip->i_df.if_flags & XFS_IFEXTENTS)); 321 (ip->i_df.if_flags & XFS_IFEXTENTS));
311 ASSERT(offset <= mp->m_super->s_maxbytes); 322 ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
341 return 0; 352 return 0;
342} 353}
343 354
344STATIC int 355STATIC bool
345xfs_imap_valid( 356xfs_imap_valid(
346 struct inode *inode, 357 struct inode *inode,
347 struct xfs_bmbt_irec *imap, 358 struct xfs_bmbt_irec *imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
414STATIC void 425STATIC void
415xfs_start_page_writeback( 426xfs_start_page_writeback(
416 struct page *page, 427 struct page *page,
417 int clear_dirty, 428 int clear_dirty)
418 int buffers)
419{ 429{
420 ASSERT(PageLocked(page)); 430 ASSERT(PageLocked(page));
421 ASSERT(!PageWriteback(page)); 431 ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
434 set_page_writeback_keepwrite(page); 444 set_page_writeback_keepwrite(page);
435 445
436 unlock_page(page); 446 unlock_page(page);
437
438 /* If no buffers on the page are to be written, finish it here */
439 if (!buffers)
440 end_page_writeback(page);
441} 447}
442 448
443static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) 449static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
446} 452}
447 453
448/* 454/*
449 * Submit all of the bios for all of the ioends we have saved up, covering the 455 * Submit all of the bios for an ioend. We are only passed a single ioend at a
450 * initial writepage page and also any probed pages. 456 * time; the caller is responsible for chaining prior to submission.
451 *
452 * Because we may have multiple ioends spanning a page, we need to start
453 * writeback on all the buffers before we submit them for I/O. If we mark the
454 * buffers as we got, then we can end up with a page that only has buffers
455 * marked async write and I/O complete on can occur before we mark the other
456 * buffers async write.
457 *
458 * The end result of this is that we trip a bug in end_page_writeback() because
459 * we call it twice for the one page as the code in end_buffer_async_write()
460 * assumes that all buffers on the page are started at the same time.
461 *
462 * The fix is two passes across the ioend list - one to start writeback on the
463 * buffer_heads, and then submit them for I/O on the second pass.
464 * 457 *
465 * If @fail is non-zero, it means that we have a situation where some part of 458 * If @fail is non-zero, it means that we have a situation where some part of
466 * the submission process has failed after we have marked pages for writeback 459 * the submission process has failed after we have marked pages for writeback
467 * and unlocked them. In this situation, we need to fail the ioend chain rather 460 * and unlocked them. In this situation, we need to fail the ioend chain rather
468 * than submit it to IO. This typically only happens on a filesystem shutdown. 461 * than submit it to IO. This typically only happens on a filesystem shutdown.
469 */ 462 */
470STATIC void 463STATIC int
471xfs_submit_ioend( 464xfs_submit_ioend(
472 struct writeback_control *wbc, 465 struct writeback_control *wbc,
473 xfs_ioend_t *ioend, 466 xfs_ioend_t *ioend,
474 int fail) 467 int status)
475{ 468{
476 xfs_ioend_t *head = ioend;
477 xfs_ioend_t *next;
478 struct buffer_head *bh; 469 struct buffer_head *bh;
479 struct bio *bio; 470 struct bio *bio;
480 sector_t lastblock = 0; 471 sector_t lastblock = 0;
481 472
482 /* Pass 1 - start writeback */ 473 /* Reserve log space if we might write beyond the on-disk inode size. */
483 do { 474 if (!status &&
484 next = ioend->io_list; 475 ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
485 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 476 status = xfs_setfilesize_trans_alloc(ioend);
486 xfs_start_buffer_writeback(bh); 477 /*
487 } while ((ioend = next) != NULL); 478 * If we are failing the IO now, just mark the ioend with an
479 * error and finish it. This will run IO completion immediately
480 * as there is only one reference to the ioend at this point in
481 * time.
482 */
483 if (status) {
484 ioend->io_error = status;
485 xfs_finish_ioend(ioend);
486 return status;
487 }
488 488
489 /* Pass 2 - submit I/O */ 489 bio = NULL;
490 ioend = head; 490 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
491 do {
492 next = ioend->io_list;
493 bio = NULL;
494 491
495 /* 492 if (!bio) {
496 * If we are failing the IO now, just mark the ioend with an 493retry:
497 * error and finish it. This will run IO completion immediately 494 bio = xfs_alloc_ioend_bio(bh);
498 * as there is only one reference to the ioend at this point in 495 } else if (bh->b_blocknr != lastblock + 1) {
499 * time. 496 xfs_submit_ioend_bio(wbc, ioend, bio);
500 */ 497 goto retry;
501 if (fail) {
502 ioend->io_error = fail;
503 xfs_finish_ioend(ioend);
504 continue;
505 } 498 }
506 499
507 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
508
509 if (!bio) {
510 retry:
511 bio = xfs_alloc_ioend_bio(bh);
512 } else if (bh->b_blocknr != lastblock + 1) {
513 xfs_submit_ioend_bio(wbc, ioend, bio);
514 goto retry;
515 }
516
517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
518 xfs_submit_ioend_bio(wbc, ioend, bio);
519 goto retry;
520 }
521
522 lastblock = bh->b_blocknr;
523 }
524 if (bio)
525 xfs_submit_ioend_bio(wbc, ioend, bio); 501 xfs_submit_ioend_bio(wbc, ioend, bio);
526 xfs_finish_ioend(ioend); 502 goto retry;
527 } while ((ioend = next) != NULL); 503 }
528}
529
530/*
531 * Cancel submission of all buffer_heads so far in this endio.
532 * Toss the endio too. Only ever called for the initial page
533 * in a writepage request, so only ever one page.
534 */
535STATIC void
536xfs_cancel_ioend(
537 xfs_ioend_t *ioend)
538{
539 xfs_ioend_t *next;
540 struct buffer_head *bh, *next_bh;
541
542 do {
543 next = ioend->io_list;
544 bh = ioend->io_buffer_head;
545 do {
546 next_bh = bh->b_private;
547 clear_buffer_async_write(bh);
548 /*
549 * The unwritten flag is cleared when added to the
550 * ioend. We're not submitting for I/O so mark the
551 * buffer unwritten again for next time around.
552 */
553 if (ioend->io_type == XFS_IO_UNWRITTEN)
554 set_buffer_unwritten(bh);
555 unlock_buffer(bh);
556 } while ((bh = next_bh) != NULL);
557 504
558 mempool_free(ioend, xfs_ioend_pool); 505 lastblock = bh->b_blocknr;
559 } while ((ioend = next) != NULL); 506 }
507 if (bio)
508 xfs_submit_ioend_bio(wbc, ioend, bio);
509 xfs_finish_ioend(ioend);
510 return 0;
560} 511}
561 512
562/* 513/*
563 * Test to see if we've been building up a completion structure for 514 * Test to see if we've been building up a completion structure for
564 * earlier buffers -- if so, we try to append to this ioend if we 515 * earlier buffers -- if so, we try to append to this ioend if we
565 * can, otherwise we finish off any current ioend and start another. 516 * can, otherwise we finish off any current ioend and start another.
566 * Return true if we've finished the given ioend. 517 * Return the ioend we finished off so that the caller can submit it
518 * once it has finished processing the dirty page.
567 */ 519 */
568STATIC void 520STATIC void
569xfs_add_to_ioend( 521xfs_add_to_ioend(
570 struct inode *inode, 522 struct inode *inode,
571 struct buffer_head *bh, 523 struct buffer_head *bh,
572 xfs_off_t offset, 524 xfs_off_t offset,
573 unsigned int type, 525 struct xfs_writepage_ctx *wpc,
574 xfs_ioend_t **result, 526 struct list_head *iolist)
575 int need_ioend)
576{ 527{
577 xfs_ioend_t *ioend = *result; 528 if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
578 529 bh->b_blocknr != wpc->last_block + 1 ||
579 if (!ioend || need_ioend || type != ioend->io_type) { 530 offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
580 xfs_ioend_t *previous = *result; 531 struct xfs_ioend *new;
581 532
582 ioend = xfs_alloc_ioend(inode, type); 533 if (wpc->ioend)
583 ioend->io_offset = offset; 534 list_add(&wpc->ioend->io_list, iolist);
584 ioend->io_buffer_head = bh; 535
585 ioend->io_buffer_tail = bh; 536 new = xfs_alloc_ioend(inode, wpc->io_type);
586 if (previous) 537 new->io_offset = offset;
587 previous->io_list = ioend; 538 new->io_buffer_head = bh;
588 *result = ioend; 539 new->io_buffer_tail = bh;
540 wpc->ioend = new;
589 } else { 541 } else {
590 ioend->io_buffer_tail->b_private = bh; 542 wpc->ioend->io_buffer_tail->b_private = bh;
591 ioend->io_buffer_tail = bh; 543 wpc->ioend->io_buffer_tail = bh;
592 } 544 }
593 545
594 bh->b_private = NULL; 546 bh->b_private = NULL;
595 ioend->io_size += bh->b_size; 547 wpc->ioend->io_size += bh->b_size;
548 wpc->last_block = bh->b_blocknr;
549 xfs_start_buffer_writeback(bh);
596} 550}
597 551
598STATIC void 552STATIC void
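
The rewritten xfs_add_to_ioend() above appends a buffer to the cached ioend only when three things line up: the mapping type, the disk block, and the file offset. Otherwise the cached ioend is pushed onto the caller's local list and a fresh one is started. That decision, isolated as a standalone predicate (types and names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct wpc_sketch {		/* cf. struct xfs_writepage_ctx */
	int io_type;
	uint64_t last_block;
};

struct ioend_sketch {
	int io_type;
	uint64_t io_offset;
	uint64_t io_size;
};

/* True if a buffer at (blocknr, offset) can extend the cached ioend. */
static bool can_append(const struct wpc_sketch *wpc,
		       const struct ioend_sketch *ioend,
		       uint64_t blocknr, uint64_t offset)
{
	if (!ioend)
		return false;
	return wpc->io_type == ioend->io_type &&
	       blocknr == wpc->last_block + 1 &&
	       offset == ioend->io_offset + ioend->io_size;
}

int main(void)
{
	struct wpc_sketch wpc = { .io_type = 1, .last_block = 99 };
	struct ioend_sketch io = { .io_type = 1, .io_offset = 0, .io_size = 4096 };

	printf("contiguous: %d\n", can_append(&wpc, &io, 100, 4096));
	printf("disk gap:   %d\n", can_append(&wpc, &io, 102, 4096));
	return 0;
}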
@@ -678,183 +632,6 @@ xfs_check_page_type(
678 return false; 632 return false;
679} 633}
680 634
681/*
682 * Allocate & map buffers for page given the extent map. Write it out.
683 * Except for the original page of a writepage, this is called on
684 * delalloc/unwritten pages only; for the original page it is possible
685 * that the page has no mapping at all.
686 */
687STATIC int
688xfs_convert_page(
689 struct inode *inode,
690 struct page *page,
691 loff_t tindex,
692 struct xfs_bmbt_irec *imap,
693 xfs_ioend_t **ioendp,
694 struct writeback_control *wbc)
695{
696 struct buffer_head *bh, *head;
697 xfs_off_t end_offset;
698 unsigned long p_offset;
699 unsigned int type;
700 int len, page_dirty;
701 int count = 0, done = 0, uptodate = 1;
702 xfs_off_t offset = page_offset(page);
703
704 if (page->index != tindex)
705 goto fail;
706 if (!trylock_page(page))
707 goto fail;
708 if (PageWriteback(page))
709 goto fail_unlock_page;
710 if (page->mapping != inode->i_mapping)
711 goto fail_unlock_page;
712 if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
713 goto fail_unlock_page;
714
715 /*
716 * page_dirty is initially a count of buffers on the page before
717 * EOF and is decremented as we move each into a cleanable state.
718 *
719 * Derivation:
720 *
721 * End offset is the highest offset that this page should represent.
722 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
723 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
724 * hence give us the correct page_dirty count. On any other page,
725 * it will be zero and in that case we need page_dirty to be the
726 * count of buffers on the page.
727 */
728 end_offset = min_t(unsigned long long,
729 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
730 i_size_read(inode));
731
732 /*
733 * If the current map does not span the entire page we are about to try
734 * to write, then give up. The only way we can write a page that spans
735 * multiple mappings in a single writeback iteration is via the
736 * xfs_vm_writepage() function. Data integrity writeback requires the
737 * entire page to be written in a single attempt, otherwise the part of
738 * the page we don't write here doesn't get written as part of the data
739 * integrity sync.
740 *
741 * For normal writeback, we also don't attempt to write partial pages
742 * here as it simply means that write_cache_pages() will see it under
743 * writeback and ignore the page until some point in the future, at
744 * which time this will be the only page in the file that needs
745 * writeback. Hence for more optimal IO patterns, we should always
746 * avoid partial page writeback due to multiple mappings on a page here.
747 */
748 if (!xfs_imap_valid(inode, imap, end_offset))
749 goto fail_unlock_page;
750
751 len = 1 << inode->i_blkbits;
752 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
753 PAGE_CACHE_SIZE);
754 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
755 page_dirty = p_offset / len;
756
757 /*
758 * The moment we find a buffer that doesn't match our current type
759 * specification or can't be written, abort the loop and start
760 * writeback. As per the above xfs_imap_valid() check, only
761 * xfs_vm_writepage() can handle partial page writeback fully - we are
762 * limited here to the buffers that are contiguous with the current
763 * ioend, and hence a buffer we can't write breaks that contiguity and
764 * we have to defer the rest of the IO to xfs_vm_writepage().
765 */
766 bh = head = page_buffers(page);
767 do {
768 if (offset >= end_offset)
769 break;
770 if (!buffer_uptodate(bh))
771 uptodate = 0;
772 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
773 done = 1;
774 break;
775 }
776
777 if (buffer_unwritten(bh) || buffer_delay(bh) ||
778 buffer_mapped(bh)) {
779 if (buffer_unwritten(bh))
780 type = XFS_IO_UNWRITTEN;
781 else if (buffer_delay(bh))
782 type = XFS_IO_DELALLOC;
783 else
784 type = XFS_IO_OVERWRITE;
785
786 /*
787 * imap should always be valid because of the above
788 * partial page end_offset check on the imap.
789 */
790 ASSERT(xfs_imap_valid(inode, imap, offset));
791
792 lock_buffer(bh);
793 if (type != XFS_IO_OVERWRITE)
794 xfs_map_at_offset(inode, bh, imap, offset);
795 xfs_add_to_ioend(inode, bh, offset, type,
796 ioendp, done);
797
798 page_dirty--;
799 count++;
800 } else {
801 done = 1;
802 break;
803 }
804 } while (offset += len, (bh = bh->b_this_page) != head);
805
806 if (uptodate && bh == head)
807 SetPageUptodate(page);
808
809 if (count) {
810 if (--wbc->nr_to_write <= 0 &&
811 wbc->sync_mode == WB_SYNC_NONE)
812 done = 1;
813 }
814 xfs_start_page_writeback(page, !page_dirty, count);
815
816 return done;
817 fail_unlock_page:
818 unlock_page(page);
819 fail:
820 return 1;
821}
822
823/*
824 * Convert & write out a cluster of pages in the same extent as defined
825 * by mp and following the start page.
826 */
827STATIC void
828xfs_cluster_write(
829 struct inode *inode,
830 pgoff_t tindex,
831 struct xfs_bmbt_irec *imap,
832 xfs_ioend_t **ioendp,
833 struct writeback_control *wbc,
834 pgoff_t tlast)
835{
836 struct pagevec pvec;
837 int done = 0, i;
838
839 pagevec_init(&pvec, 0);
840 while (!done && tindex <= tlast) {
841 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
842
843 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
844 break;
845
846 for (i = 0; i < pagevec_count(&pvec); i++) {
847 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
848 imap, ioendp, wbc);
849 if (done)
850 break;
851 }
852
853 pagevec_release(&pvec);
854 cond_resched();
855 }
856}
857
858STATIC void 635STATIC void
859xfs_vm_invalidatepage( 636xfs_vm_invalidatepage(
860 struct page *page, 637 struct page *page,
@@ -932,6 +709,164 @@ out_invalidate:
932} 709}
933 710
934/* 711/*
712 * We implement an immediate ioend submission policy here to avoid needing to
713 * chain multiple ioends and hence nest mempool allocations which can violate
714 * forward progress guarantees we need to provide. The current ioend we are
715 * adding buffers to is cached on the writepage context, and if the new buffer
716 * does not append to the cached ioend it will create a new ioend and cache that
717 * instead.
718 *
719 * If a new ioend is created and cached, the old ioend is returned and queued
720 * locally for submission once the entire page is processed or an error has been
721 * detected. While ioends are submitted immediately after they are completed,
722 * batching optimisations are provided by higher level block plugging.
723 *
724 * At the end of a writeback pass, there will be a cached ioend remaining on the
725 * writepage context that the caller will need to submit.
726 */
727static int
728xfs_writepage_map(
729 struct xfs_writepage_ctx *wpc,
730 struct writeback_control *wbc,
731 struct inode *inode,
732 struct page *page,
733 loff_t offset,
734 __uint64_t end_offset)
735{
736 LIST_HEAD(submit_list);
737 struct xfs_ioend *ioend, *next;
738 struct buffer_head *bh, *head;
739 ssize_t len = 1 << inode->i_blkbits;
740 int error = 0;
741 int count = 0;
742 int uptodate = 1;
743
744 bh = head = page_buffers(page);
745 offset = page_offset(page);
746 do {
747 if (offset >= end_offset)
748 break;
749 if (!buffer_uptodate(bh))
750 uptodate = 0;
751
752 /*
753 * set_page_dirty dirties all buffers in a page, independent
754 * of their state. The dirty state however is entirely
755 * meaningless for holes (!mapped && uptodate), so skip
756 * buffers covering holes here.
757 */
758 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
759 wpc->imap_valid = false;
760 continue;
761 }
762
763 if (buffer_unwritten(bh)) {
764 if (wpc->io_type != XFS_IO_UNWRITTEN) {
765 wpc->io_type = XFS_IO_UNWRITTEN;
766 wpc->imap_valid = false;
767 }
768 } else if (buffer_delay(bh)) {
769 if (wpc->io_type != XFS_IO_DELALLOC) {
770 wpc->io_type = XFS_IO_DELALLOC;
771 wpc->imap_valid = false;
772 }
773 } else if (buffer_uptodate(bh)) {
774 if (wpc->io_type != XFS_IO_OVERWRITE) {
775 wpc->io_type = XFS_IO_OVERWRITE;
776 wpc->imap_valid = false;
777 }
778 } else {
779 if (PageUptodate(page))
780 ASSERT(buffer_mapped(bh));
781 /*
782 * This buffer is not uptodate and will not be
783 * written to disk. Ensure that we will put any
784 * subsequent writeable buffers into a new
785 * ioend.
786 */
787 wpc->imap_valid = false;
788 continue;
789 }
790
791 if (wpc->imap_valid)
792 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
793 offset);
794 if (!wpc->imap_valid) {
795 error = xfs_map_blocks(inode, offset, &wpc->imap,
796 wpc->io_type);
797 if (error)
798 goto out;
799 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
800 offset);
801 }
802 if (wpc->imap_valid) {
803 lock_buffer(bh);
804 if (wpc->io_type != XFS_IO_OVERWRITE)
805 xfs_map_at_offset(inode, bh, &wpc->imap, offset);
806 xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
807 count++;
808 }
809
810 } while (offset += len, ((bh = bh->b_this_page) != head));
811
812 if (uptodate && bh == head)
813 SetPageUptodate(page);
814
815 ASSERT(wpc->ioend || list_empty(&submit_list));
816
817out:
818 /*
819 * On error, we have to fail the ioend here because we have locked
820 * buffers in the ioend. If we don't do this, we'll deadlock
821 * invalidating the page as that tries to lock the buffers on the page.
822 * Also, because we may have set pages under writeback, we have to make
823 * sure we run IO completion to mark the error state of the IO
824 * appropriately, so we can't cancel the ioend directly here. That means
825 * we have to mark this page as under writeback if we included any
826 * buffers from it in the ioend chain so that completion treats it
827 * correctly.
828 *
829 * If we didn't include the page in the ioend, then on error we can
830 * simply discard and unlock it as there are no other users of the page
831 * or its buffers right now. The caller will still need to trigger
832 * submission of outstanding ioends on the writepage context so they are
833 * treated correctly on error.
834 */
835 if (count) {
836 xfs_start_page_writeback(page, !error);
837
838 /*
839 * Preserve the original error if there was one, otherwise catch
840 * submission errors here and propagate into subsequent ioend
841 * submissions.
842 */
843 list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
844 int error2;
845
846 list_del_init(&ioend->io_list);
847 error2 = xfs_submit_ioend(wbc, ioend, error);
848 if (error2 && !error)
849 error = error2;
850 }
851 } else if (error) {
852 xfs_aops_discard_page(page);
853 ClearPageUptodate(page);
854 unlock_page(page);
855 } else {
856 /*
857 * We can end up here with no error and nothing to write if we
858 * race with a partial page truncate on a sub-page block sized
859 * filesystem. In that case we need to mark the page clean.
860 */
861 xfs_start_page_writeback(page, 1);
862 end_page_writeback(page);
863 }
864
865 mapping_set_error(page->mapping, error);
866 return error;
867}
868
869/*
935 * Write out a dirty page. 870 * Write out a dirty page.
936 * 871 *
937 * For delalloc space on the page we need to allocate space and flush it. 872 * For delalloc space on the page we need to allocate space and flush it.
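
One detail of xfs_writepage_map() worth calling out: after an error, every ioend already queued on submit_list is still passed to xfs_submit_ioend(), which fails it through the normal completion path, and only the first error is reported. The control-flow skeleton of that "preserve the first error, drain the rest" loop, as plain C with submit() standing in for xfs_submit_ioend():

#include <stdio.h>

/* Stand-in for xfs_submit_ioend(): a prior error fails the ioend instead. */
static int submit(int id, int status)
{
	if (status) {
		printf("ioend %d: failed through completion with %d\n", id, status);
		return status;
	}
	printf("ioend %d: submitted\n", id);
	return id == 2 ? -5 : 0;	/* pretend ioend 2 hits -EIO */
}

int main(void)
{
	int error = 0;

	for (int id = 0; id < 4; id++) {
		int error2 = submit(id, error);

		if (error2 && !error)
			error = error2;
	}
	printf("first error preserved: %d\n", error);
	return 0;
}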
@@ -940,22 +875,16 @@ out_invalidate:
940 * For any other dirty buffer heads on the page we should flush them. 875 * For any other dirty buffer heads on the page we should flush them.
941 */ 876 */
942STATIC int 877STATIC int
943xfs_vm_writepage( 878xfs_do_writepage(
944 struct page *page, 879 struct page *page,
945 struct writeback_control *wbc) 880 struct writeback_control *wbc,
881 void *data)
946{ 882{
883 struct xfs_writepage_ctx *wpc = data;
947 struct inode *inode = page->mapping->host; 884 struct inode *inode = page->mapping->host;
948 struct buffer_head *bh, *head;
949 struct xfs_bmbt_irec imap;
950 xfs_ioend_t *ioend = NULL, *iohead = NULL;
951 loff_t offset; 885 loff_t offset;
952 unsigned int type;
953 __uint64_t end_offset; 886 __uint64_t end_offset;
954 pgoff_t end_index, last_index; 887 pgoff_t end_index;
955 ssize_t len;
956 int err, imap_valid = 0, uptodate = 1;
957 int count = 0;
958 int nonblocking = 0;
959 888
960 trace_xfs_writepage(inode, page, 0, 0); 889 trace_xfs_writepage(inode, page, 0, 0);
961 890
@@ -982,12 +911,9 @@ xfs_vm_writepage(
982 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 911 if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
983 goto redirty; 912 goto redirty;
984 913
985 /* Is this page beyond the end of the file? */
986 offset = i_size_read(inode);
987 end_index = offset >> PAGE_CACHE_SHIFT;
988 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
989
990 /* 914 /*
915 * Is this page beyond the end of the file?
916 *
991 * The page index is less than the end_index, adjust the end_offset 917 * The page index is less than the end_index, adjust the end_offset
992 * to the highest offset that this page should represent. 918 * to the highest offset that this page should represent.
993 * ----------------------------------------------------- 919 * -----------------------------------------------------
@@ -998,6 +924,8 @@ xfs_vm_writepage(
998 * | desired writeback range | see else | 924 * | desired writeback range | see else |
999 * ---------------------------------^------------------| 925 * ---------------------------------^------------------|
1000 */ 926 */
927 offset = i_size_read(inode);
928 end_index = offset >> PAGE_CACHE_SHIFT;
1001 if (page->index < end_index) 929 if (page->index < end_index)
1002 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 930 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
1003 else { 931 else {
@@ -1049,152 +977,7 @@ xfs_vm_writepage(
1049 end_offset = offset; 977 end_offset = offset;
1050 } 978 }
1051 979
1052 len = 1 << inode->i_blkbits; 980 return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
1053
1054 bh = head = page_buffers(page);
1055 offset = page_offset(page);
1056 type = XFS_IO_OVERWRITE;
1057
1058 if (wbc->sync_mode == WB_SYNC_NONE)
1059 nonblocking = 1;
1060
1061 do {
1062 int new_ioend = 0;
1063
1064 if (offset >= end_offset)
1065 break;
1066 if (!buffer_uptodate(bh))
1067 uptodate = 0;
1068
1069 /*
1070 * set_page_dirty dirties all buffers in a page, independent
1071 * of their state. The dirty state however is entirely
1072 * meaningless for holes (!mapped && uptodate), so skip
1073 * buffers covering holes here.
1074 */
1075 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1076 imap_valid = 0;
1077 continue;
1078 }
1079
1080 if (buffer_unwritten(bh)) {
1081 if (type != XFS_IO_UNWRITTEN) {
1082 type = XFS_IO_UNWRITTEN;
1083 imap_valid = 0;
1084 }
1085 } else if (buffer_delay(bh)) {
1086 if (type != XFS_IO_DELALLOC) {
1087 type = XFS_IO_DELALLOC;
1088 imap_valid = 0;
1089 }
1090 } else if (buffer_uptodate(bh)) {
1091 if (type != XFS_IO_OVERWRITE) {
1092 type = XFS_IO_OVERWRITE;
1093 imap_valid = 0;
1094 }
1095 } else {
1096 if (PageUptodate(page))
1097 ASSERT(buffer_mapped(bh));
1098 /*
1099 * This buffer is not uptodate and will not be
1100 * written to disk. Ensure that we will put any
1101 * subsequent writeable buffers into a new
1102 * ioend.
1103 */
1104 imap_valid = 0;
1105 continue;
1106 }
1107
1108 if (imap_valid)
1109 imap_valid = xfs_imap_valid(inode, &imap, offset);
1110 if (!imap_valid) {
1111 /*
1112 * If we didn't have a valid mapping then we need to
1113 * put the new mapping into a separate ioend structure.
1114 * This ensures non-contiguous extents always have
1115 * separate ioends, which is particularly important
1116 * for unwritten extent conversion at I/O completion
1117 * time.
1118 */
1119 new_ioend = 1;
1120 err = xfs_map_blocks(inode, offset, &imap, type,
1121 nonblocking);
1122 if (err)
1123 goto error;
1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1125 }
1126 if (imap_valid) {
1127 lock_buffer(bh);
1128 if (type != XFS_IO_OVERWRITE)
1129 xfs_map_at_offset(inode, bh, &imap, offset);
1130 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1131 new_ioend);
1132 count++;
1133 }
1134
1135 if (!iohead)
1136 iohead = ioend;
1137
1138 } while (offset += len, ((bh = bh->b_this_page) != head));
1139
1140 if (uptodate && bh == head)
1141 SetPageUptodate(page);
1142
1143 xfs_start_page_writeback(page, 1, count);
1144
1145 /* if there is no IO to be submitted for this page, we are done */
1146 if (!ioend)
1147 return 0;
1148
1149 ASSERT(iohead);
1150
1151 /*
1152 * Any errors from this point onwards need to be reported through the IO
1153 * completion path as we have marked the initial page as under writeback
1154 * and unlocked it.
1155 */
1156 if (imap_valid) {
1157 xfs_off_t end_index;
1158
1159 end_index = imap.br_startoff + imap.br_blockcount;
1160
1161 /* to bytes */
1162 end_index <<= inode->i_blkbits;
1163
1164 /* to pages */
1165 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1166
1167 /* check against file size */
1168 if (end_index > last_index)
1169 end_index = last_index;
1170
1171 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1172 wbc, end_index);
1173 }
1174
1175
1176 /*
1177 * Reserve log space if we might write beyond the on-disk inode size.
1178 */
1179 err = 0;
1180 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1181 err = xfs_setfilesize_trans_alloc(ioend);
1182
1183 xfs_submit_ioend(wbc, iohead, err);
1184
1185 return 0;
1186
1187error:
1188 if (iohead)
1189 xfs_cancel_ioend(iohead);
1190
1191 if (err == -EAGAIN)
1192 goto redirty;
1193
1194 xfs_aops_discard_page(page);
1195 ClearPageUptodate(page);
1196 unlock_page(page);
1197 return err;
1198 981
1199redirty: 982redirty:
1200 redirty_page_for_writepage(wbc, page); 983 redirty_page_for_writepage(wbc, page);
@@ -1203,16 +986,40 @@ redirty:
1203} 986}
1204 987
1205STATIC int 988STATIC int
989xfs_vm_writepage(
990 struct page *page,
991 struct writeback_control *wbc)
992{
993 struct xfs_writepage_ctx wpc = {
994 .io_type = XFS_IO_INVALID,
995 };
996 int ret;
997
998 ret = xfs_do_writepage(page, wbc, &wpc);
999 if (wpc.ioend)
1000 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1001 return ret;
1002}
1003
1004STATIC int
1206xfs_vm_writepages( 1005xfs_vm_writepages(
1207 struct address_space *mapping, 1006 struct address_space *mapping,
1208 struct writeback_control *wbc) 1007 struct writeback_control *wbc)
1209{ 1008{
1009 struct xfs_writepage_ctx wpc = {
1010 .io_type = XFS_IO_INVALID,
1011 };
1012 int ret;
1013
1210 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1014 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1211 if (dax_mapping(mapping)) 1015 if (dax_mapping(mapping))
1212 return dax_writeback_mapping_range(mapping, 1016 return dax_writeback_mapping_range(mapping,
1213 xfs_find_bdev_for_inode(mapping->host), wbc); 1017 xfs_find_bdev_for_inode(mapping->host), wbc);
1214 1018
1215 return generic_writepages(mapping, wbc); 1019 ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1020 if (wpc.ioend)
1021 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1022 return ret;
1216} 1023}
1217 1024
1218/* 1025/*
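
Switching from generic_writepages() to write_cache_pages() is what lets the writepage context survive across pages: the core page walk invokes the callback with an opaque data pointer, so one cached ioend can keep growing across page boundaries. A toy version of that callback-plus-context contract, with all names invented:

#include <stdio.h>

struct ctx_sketch {
	int pages_seen;	/* state shared across every callback invocation */
};

typedef int (*writepage_fn)(int page_index, void *data);

/* Stand-in for write_cache_pages(): walk pages, threading the context. */
static int walk_dirty_pages(int npages, writepage_fn fn, void *data)
{
	int ret = 0;

	for (int i = 0; i < npages && !ret; i++)
		ret = fn(i, data);
	return ret;
}

static int do_writepage_sketch(int page_index, void *data)
{
	struct ctx_sketch *ctx = data;

	ctx->pages_seen++;
	printf("page %d handled with shared context\n", page_index);
	return 0;
}

int main(void)
{
	struct ctx_sketch ctx = { 0 };
	int ret = walk_dirty_pages(3, do_writepage_sketch, &ctx);

	/* The caller submits whatever the context still caches, as above. */
	printf("ret=%d, pages_seen=%d\n", ret, ctx.pages_seen);
	return 0;
}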
@@ -1242,27 +1049,8 @@ xfs_vm_releasepage(
1242} 1049}
1243 1050
1244/* 1051/*
1245 * When we map a DIO buffer, we may need to attach an ioend that describes the 1052 * When we map a DIO buffer, we may need to pass flags to
1246 * type of write IO we are doing. This passes to the completion function the 1053 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1247 * operations it needs to perform. If the mapping is for an overwrite wholly
1248 * within the EOF then we don't need an ioend and so we don't allocate one.
1249 * This avoids the unnecessary overhead of allocating and freeing ioends for
1250 * workloads that don't require transactions on IO completion.
1251 *
1252 * If we get multiple mappings in a single IO, we might be mapping different
1253 * types. But because the direct IO can only have a single private pointer, we
1254 * need to ensure that:
1255 *
1256 * a) i) the ioend spans the entire region of unwritten mappings; or
1257 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1258 * b) if it contains unwritten extents, it is *permanently* marked as such
1259 *
1260 * We could do this by chaining ioends like buffered IO does, but we only
1261 * actually get one IO completion callback from the direct IO, and that spans
1262 * the entire IO regardless of how many mappings and IOs are needed to complete
1263 * the DIO. There is only going to be one reference to the ioend and its life
1264 * cycle is constrained by the DIO completion code. Hence we don't need
1265 * reference counting here.
1266 * 1054 *
1267 * Note that for DIO, an IO to the highest supported file block offset (i.e. 1055 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1268 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 1056 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1270,68 +1058,26 @@ xfs_vm_releasepage(
1270 * extending the file size. We won't know for sure until IO completion is run 1058 * extending the file size. We won't know for sure until IO completion is run
1271 * and the actual max write offset is communicated to the IO completion 1059 * and the actual max write offset is communicated to the IO completion
1272 * routine. 1060 * routine.
1273 *
1274 * For DAX page faults, we are preparing to never see unwritten extents here,
1275 * nor should we ever extend the inode size. Hence we will soon have nothing to
1276 * do here for this case, ensuring we don't have to provide an IO completion
1277 * callback to free an ioend that we don't actually need for a fault into the
1278 * page at offset (2^63 - 1FSB) bytes.
1279 */ 1061 */
1280
1281static void 1062static void
1282xfs_map_direct( 1063xfs_map_direct(
1283 struct inode *inode, 1064 struct inode *inode,
1284 struct buffer_head *bh_result, 1065 struct buffer_head *bh_result,
1285 struct xfs_bmbt_irec *imap, 1066 struct xfs_bmbt_irec *imap,
1286 xfs_off_t offset, 1067 xfs_off_t offset)
1287 bool dax_fault)
1288{ 1068{
1289 struct xfs_ioend *ioend; 1069 uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
1290 xfs_off_t size = bh_result->b_size; 1070 xfs_off_t size = bh_result->b_size;
1291 int type;
1292
1293 if (ISUNWRITTEN(imap))
1294 type = XFS_IO_UNWRITTEN;
1295 else
1296 type = XFS_IO_OVERWRITE;
1297 1071
1298 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1072 trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1299 1073 ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
1300 if (dax_fault) {
1301 ASSERT(type == XFS_IO_OVERWRITE);
1302 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1303 imap);
1304 return;
1305 }
1306 1074
1307 if (bh_result->b_private) { 1075 if (ISUNWRITTEN(imap)) {
1308 ioend = bh_result->b_private; 1076 *flags |= XFS_DIO_FLAG_UNWRITTEN;
1309 ASSERT(ioend->io_size > 0); 1077 set_buffer_defer_completion(bh_result);
1310 ASSERT(offset >= ioend->io_offset); 1078 } else if (offset + size > i_size_read(inode) || offset + size < 0) {
1311 if (offset + size > ioend->io_offset + ioend->io_size) 1079 *flags |= XFS_DIO_FLAG_APPEND;
1312 ioend->io_size = offset - ioend->io_offset + size;
1313
1314 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1315 ioend->io_type = XFS_IO_UNWRITTEN;
1316
1317 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1318 ioend->io_size, ioend->io_type,
1319 imap);
1320 } else if (type == XFS_IO_UNWRITTEN ||
1321 offset + size > i_size_read(inode) ||
1322 offset + size < 0) {
1323 ioend = xfs_alloc_ioend(inode, type);
1324 ioend->io_offset = offset;
1325 ioend->io_size = size;
1326
1327 bh_result->b_private = ioend;
1328 set_buffer_defer_completion(bh_result); 1080 set_buffer_defer_completion(bh_result);
1329
1330 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1331 imap);
1332 } else {
1333 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1334 imap);
1335 } 1081 }
1336} 1082}
1337 1083
@@ -1502,9 +1248,12 @@ __xfs_get_blocks(
1502 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1503 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1504 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1505 if (create && direct) 1251 if (create && direct) {
1506 xfs_map_direct(inode, bh_result, &imap, offset, 1252 if (dax_fault)
1507 dax_fault); 1253 ASSERT(!ISUNWRITTEN(&imap));
1254 else
1255 xfs_map_direct(inode, bh_result, &imap, offset);
1256 }
1508 } 1257 }
1509 1258
1510 /* 1259 /*
@@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault(
1574 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); 1323 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1575} 1324}
1576 1325
1577static void 1326/*
1578__xfs_end_io_direct_write( 1327 * Complete a direct I/O write request.
1579 struct inode *inode, 1328 *
1580 struct xfs_ioend *ioend, 1329 * xfs_map_direct passes us some flags in the private data to tell us what to
1330 * do. If no flags are set, then the write IO is an overwrite wholly within
1331 * the existing allocated file size and so there is nothing for us to do.
1332 *
1333 * Note that in this case the completion can be called in interrupt context,
1334 * whereas if we have flags set we will always be called in task context
1335 * (i.e. from a workqueue).
1336 */
1337STATIC int
1338xfs_end_io_direct_write(
1339 struct kiocb *iocb,
1581 loff_t offset, 1340 loff_t offset,
1582 ssize_t size) 1341 ssize_t size,
1342 void *private)
1583{ 1343{
1584 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1344 struct inode *inode = file_inode(iocb->ki_filp);
1345 struct xfs_inode *ip = XFS_I(inode);
1346 struct xfs_mount *mp = ip->i_mount;
1347 uintptr_t flags = (uintptr_t)private;
1348 int error = 0;
1585 1349
1586 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1350 trace_xfs_end_io_direct_write(ip, offset, size);
1587 goto out_end_io;
1588 1351
1589 /* 1352 if (XFS_FORCED_SHUTDOWN(mp))
1590 * dio completion end_io functions are only called on writes if more 1353 return -EIO;
1591 * than 0 bytes was written.
1592 */
1593 ASSERT(size > 0);
1594 1354
1595 /* 1355 if (size <= 0)
1596 * The ioend only maps whole blocks, while the IO may be sector aligned. 1356 return size;
1597 * Hence the ioend offset/size may not match the IO offset/size exactly.
1598 * Because we don't map overwrites within EOF into the ioend, the offset
1599 * may not match, but only if the endio spans EOF. Either way, write
1600 * the IO sizes into the ioend so that completion processing does the
1601 * right thing.
1602 */
1603 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1604 ioend->io_size = size;
1605 ioend->io_offset = offset;
1606 1357
1607 /* 1358 /*
1608 * The ioend tells us whether we are doing unwritten extent conversion 1359 * The flags tell us whether we are doing unwritten extent conversions
1609 * or an append transaction that updates the on-disk file size. These 1360 * or an append transaction that updates the on-disk file size. These
1610 * cases are the only cases where we should *potentially* be needing 1361 * cases are the only cases where we should *potentially* be needing
1611 * to update the VFS inode size. 1362 * to update the VFS inode size.
1612 * 1363 */
1364 if (flags == 0) {
1365 ASSERT(offset + size <= i_size_read(inode));
1366 return 0;
1367 }
1368
1369 /*
1613 * We need to update the in-core inode size here so that we don't end up 1370 * We need to update the in-core inode size here so that we don't end up
1614 * with the on-disk inode size being outside the in-core inode size. We 1371 * with the on-disk inode size being outside the in-core inode size. We
1615 * have no other method of updating EOF for AIO, so always do it here 1372 * have no other method of updating EOF for AIO, so always do it here
@@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write(
1620 * here can result in EOF moving backwards and Bad Things Happen when 1377 * here can result in EOF moving backwards and Bad Things Happen when
1621 * that occurs. 1378 * that occurs.
1622 */ 1379 */
1623 spin_lock(&XFS_I(inode)->i_flags_lock); 1380 spin_lock(&ip->i_flags_lock);
1624 if (offset + size > i_size_read(inode)) 1381 if (offset + size > i_size_read(inode))
1625 i_size_write(inode, offset + size); 1382 i_size_write(inode, offset + size);
1626 spin_unlock(&XFS_I(inode)->i_flags_lock); 1383 spin_unlock(&ip->i_flags_lock);
1627 1384
1628 /* 1385 if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1629 * If we are doing an append IO that needs to update the EOF on disk, 1386 trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1630 * do the transaction reserve now so we can use common end io
1631 * processing. Stashing the error (if there is one) in the ioend will
1632 * result in the ioend processing passing on the error if it is
1633 * possible as we can't return it from here.
1634 */
1635 if (ioend->io_type == XFS_IO_OVERWRITE)
1636 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1637 1387
1638out_end_io: 1388 error = xfs_iomap_write_unwritten(ip, offset, size);
1639 xfs_end_io(&ioend->io_work); 1389 } else if (flags & XFS_DIO_FLAG_APPEND) {
1640 return; 1390 struct xfs_trans *tp;
1641}
1642 1391
1643/* 1392 trace_xfs_end_io_direct_write_append(ip, offset, size);
1644 * Complete a direct I/O write request.
1645 *
1646 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1647 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1648 * wholly within the EOF and so there is nothing for us to do. Note that in this
1649 * case the completion can be called in interrupt context, whereas if we have an
1650 * ioend we will always be called in task context (i.e. from a workqueue).
1651 */
1652STATIC void
1653xfs_end_io_direct_write(
1654 struct kiocb *iocb,
1655 loff_t offset,
1656 ssize_t size,
1657 void *private)
1658{
1659 struct inode *inode = file_inode(iocb->ki_filp);
1660 struct xfs_ioend *ioend = private;
1661
1662 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1663 ioend ? ioend->io_type : 0, NULL);
1664 1393
1665 if (!ioend) { 1394 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1666 ASSERT(offset + size <= i_size_read(inode)); 1395 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1667 return; 1396 if (error) {
1397 xfs_trans_cancel(tp);
1398 return error;
1399 }
1400 error = xfs_setfilesize(ip, tp, offset, size);
1668 } 1401 }
1669 1402
1670 __xfs_end_io_direct_write(inode, ioend, offset, size); 1403 return error;
1671} 1404}
1672 1405
1673static inline ssize_t 1406STATIC ssize_t
1674xfs_vm_do_dio( 1407xfs_vm_direct_IO(
1675 struct inode *inode,
1676 struct kiocb *iocb, 1408 struct kiocb *iocb,
1677 struct iov_iter *iter, 1409 struct iov_iter *iter,
1678 loff_t offset, 1410 loff_t offset)
1679 void (*endio)(struct kiocb *iocb,
1680 loff_t offset,
1681 ssize_t size,
1682 void *private),
1683 int flags)
1684{ 1411{
1412 struct inode *inode = iocb->ki_filp->f_mapping->host;
1413 dio_iodone_t *endio = NULL;
1414 int flags = 0;
1685 struct block_device *bdev; 1415 struct block_device *bdev;
1686 1416
1687 if (IS_DAX(inode)) 1417 if (iov_iter_rw(iter) == WRITE) {
1418 endio = xfs_end_io_direct_write;
1419 flags = DIO_ASYNC_EXTEND;
1420 }
1421
1422 if (IS_DAX(inode)) {
1688 return dax_do_io(iocb, inode, iter, offset, 1423 return dax_do_io(iocb, inode, iter, offset,
1689 xfs_get_blocks_direct, endio, 0); 1424 xfs_get_blocks_direct, endio, 0);
1425 }
1690 1426
1691 bdev = xfs_find_bdev_for_inode(inode); 1427 bdev = xfs_find_bdev_for_inode(inode);
1692 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1428 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1693 xfs_get_blocks_direct, endio, NULL, flags); 1429 xfs_get_blocks_direct, endio, NULL, flags);
1694}
1695
1696STATIC ssize_t
1697xfs_vm_direct_IO(
1698 struct kiocb *iocb,
1699 struct iov_iter *iter,
1700 loff_t offset)
1701{
1702 struct inode *inode = iocb->ki_filp->f_mapping->host;
1703
1704 if (iov_iter_rw(iter) == WRITE)
1705 return xfs_vm_do_dio(inode, iocb, iter, offset,
1706 xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
1707 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
1708} 1430}
1709 1431
1710/* 1432/*
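
The rewritten completion path above no longer threads an ioend through the direct I/O code: xfs_get_blocks_direct encodes at most two bits (XFS_DIO_FLAG_UNWRITTEN, XFS_DIO_FLAG_APPEND) into the dio private pointer, and xfs_end_io_direct_write dispatches on them. A self-contained userspace sketch of that dispatch, with invented flag values and a plain long long standing in for i_size:

#include <stdio.h>

#define DIO_FLAG_UNWRITTEN	(1 << 0)	/* illustrative values */
#define DIO_FLAG_APPEND		(1 << 1)

static int end_io_direct_write(unsigned int flags, long long offset,
			       long long size, long long *i_size)
{
	if (flags == 0)				/* overwrite inside EOF */
		return 0;

	/* Never let the on-disk size pass the in-core size. */
	if (offset + size > *i_size)
		*i_size = offset + size;

	if (flags & DIO_FLAG_UNWRITTEN)
		printf("convert unwritten extent [%lld,%lld)\n",
		       offset, offset + size);
	else if (flags & DIO_FLAG_APPEND)
		printf("log on-disk size %lld\n", offset + size);
	return 0;
}

int main(void)
{
	long long i_size = 4096;

	end_io_direct_write(DIO_FLAG_APPEND, 4096, 4096, &i_size);
	printf("in-core size now %lld\n", i_size);
	return 0;
}

Because the flag word replaces the ioend, the no-op case (flags == 0) can complete in interrupt context, exactly as the deleted comment described for the NULL-private case.
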
@@ -1756,6 +1478,7 @@ xfs_vm_write_failed(
1756 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1478 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1757 loff_t to = from + len; 1479 loff_t to = from + len;
1758 struct buffer_head *bh, *head; 1480 struct buffer_head *bh, *head;
1481 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1759 1482
1760 /* 1483 /*
1761 * The request pos offset might be 32 or 64 bit, this is all fine 1484 * The request pos offset might be 32 or 64 bit, this is all fine
@@ -1787,14 +1510,23 @@ xfs_vm_write_failed(
1787 if (block_start >= to) 1510 if (block_start >= to)
1788 break; 1511 break;
1789 1512
1790 if (!buffer_delay(bh)) 1513 /*
1514 * Process delalloc and unwritten buffers beyond EOF. We can
1515 * encounter unwritten buffers in the event that a file has
1516 * post-EOF unwritten extents and an extending write happens to
1517 * fail (e.g., an unaligned write that also involves a delalloc
1518 * to the same page).
1519 */
1520 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1791 continue; 1521 continue;
1792 1522
1793 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1523 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1524 block_offset < i_size_read(inode))
1794 continue; 1525 continue;
1795 1526
1796 xfs_vm_kill_delalloc_range(inode, block_offset, 1527 if (buffer_delay(bh))
1797 block_offset + bh->b_size); 1528 xfs_vm_kill_delalloc_range(inode, block_offset,
1529 block_offset + bh->b_size);
1798 1530
1799 /* 1531 /*
1800 * This buffer does not contain data anymore. Make sure anyone 1532 * This buffer does not contain data anymore. Make sure anyone
@@ -1805,6 +1537,7 @@ xfs_vm_write_failed(
1805 clear_buffer_mapped(bh); 1537 clear_buffer_mapped(bh);
1806 clear_buffer_new(bh); 1538 clear_buffer_new(bh);
1807 clear_buffer_dirty(bh); 1539 clear_buffer_dirty(bh);
1540 clear_buffer_unwritten(bh);
1808 } 1541 }
1809 1542
1810} 1543}
@@ -1828,6 +1561,7 @@ xfs_vm_write_begin(
1828 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1561 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1829 struct page *page; 1562 struct page *page;
1830 int status; 1563 int status;
1564 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1831 1565
1832 ASSERT(len <= PAGE_CACHE_SIZE); 1566 ASSERT(len <= PAGE_CACHE_SIZE);
1833 1567
@@ -1836,6 +1570,8 @@ xfs_vm_write_begin(
1836 return -ENOMEM; 1570 return -ENOMEM;
1837 1571
1838 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1572 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1573 if (xfs_mp_fail_writes(mp))
1574 status = -EIO;
1839 if (unlikely(status)) { 1575 if (unlikely(status)) {
1840 struct inode *inode = mapping->host; 1576 struct inode *inode = mapping->host;
1841 size_t isize = i_size_read(inode); 1577 size_t isize = i_size_read(inode);
@@ -1848,6 +1584,8 @@ xfs_vm_write_begin(
1848 * allocated in this write, not blocks that were previously 1584 * allocated in this write, not blocks that were previously
1849 * written successfully. 1585 * written successfully.
1850 */ 1586 */
1587 if (xfs_mp_fail_writes(mp))
1588 isize = 0;
1851 if (pos + len > isize) { 1589 if (pos + len > isize) {
1852 ssize_t start = max_t(ssize_t, pos, isize); 1590 ssize_t start = max_t(ssize_t, pos, isize);
1853 1591
@@ -1957,7 +1695,6 @@ xfs_vm_set_page_dirty(
1957 loff_t end_offset; 1695 loff_t end_offset;
1958 loff_t offset; 1696 loff_t offset;
1959 int newly_dirty; 1697 int newly_dirty;
1960 struct mem_cgroup *memcg;
1961 1698
1962 if (unlikely(!mapping)) 1699 if (unlikely(!mapping))
1963 return !TestSetPageDirty(page); 1700 return !TestSetPageDirty(page);
@@ -1978,10 +1715,10 @@ xfs_vm_set_page_dirty(
1978 } while (bh != head); 1715 } while (bh != head);
1979 } 1716 }
1980 /* 1717 /*
1981 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 1718 * Lock out page->mem_cgroup migration to keep PageDirty
1982 * per-memcg dirty page counters. 1719 * synchronized with per-memcg dirty page counters.
1983 */ 1720 */
1984 memcg = mem_cgroup_begin_page_stat(page); 1721 lock_page_memcg(page);
1985 newly_dirty = !TestSetPageDirty(page); 1722 newly_dirty = !TestSetPageDirty(page);
1986 spin_unlock(&mapping->private_lock); 1723 spin_unlock(&mapping->private_lock);
1987 1724
@@ -1992,13 +1729,13 @@ xfs_vm_set_page_dirty(
1992 spin_lock_irqsave(&mapping->tree_lock, flags); 1729 spin_lock_irqsave(&mapping->tree_lock, flags);
1993 if (page->mapping) { /* Race with truncate? */ 1730 if (page->mapping) { /* Race with truncate? */
1994 WARN_ON_ONCE(!PageUptodate(page)); 1731 WARN_ON_ONCE(!PageUptodate(page));
1995 account_page_dirtied(page, mapping, memcg); 1732 account_page_dirtied(page, mapping);
1996 radix_tree_tag_set(&mapping->page_tree, 1733 radix_tree_tag_set(&mapping->page_tree,
1997 page_index(page), PAGECACHE_TAG_DIRTY); 1734 page_index(page), PAGECACHE_TAG_DIRTY);
1998 } 1735 }
1999 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1736 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2000 } 1737 }
2001 mem_cgroup_end_page_stat(memcg); 1738 unlock_page_memcg(page);
2002 if (newly_dirty) 1739 if (newly_dirty)
2003 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1740 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2004 return newly_dirty; 1741 return newly_dirty;
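
The hunk above is a straight conversion to the 4.6 memcg API: mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() returned a token that had to be passed to account_page_dirtied(), while lock_page_memcg()/unlock_page_memcg() pin page->mem_cgroup in place so no token is needed. A runnable mock of the new shape, with stub functions standing in for the kernel calls:

#include <stdio.h>

struct page { int dirty; };

static void lock_page_memcg(struct page *p)   { (void)p; /* pin memcg */ }
static void unlock_page_memcg(struct page *p) { (void)p; }

static int test_set_page_dirty(struct page *p)
{
	int was = p->dirty;

	p->dirty = 1;
	return was;
}

int main(void)
{
	struct page pg = { 0 };

	/* 4.6 pattern: no memcg token to carry around any more. */
	lock_page_memcg(&pg);
	if (!test_set_page_dirty(&pg))
		printf("account page as dirty under memcg lock\n");
	unlock_page_memcg(&pg);
	return 0;
}
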
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index a4343c63fb38..b4421177b68d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool;
24 * Types of I/O for bmap clustering and I/O completion tracking. 24 * Types of I/O for bmap clustering and I/O completion tracking.
25 */ 25 */
26enum { 26enum {
27 XFS_IO_INVALID, /* initial state */
27 XFS_IO_DELALLOC, /* covers delalloc region */ 28 XFS_IO_DELALLOC, /* covers delalloc region */
28 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ 29 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
29 XFS_IO_OVERWRITE, /* covers already allocated extent */ 30 XFS_IO_OVERWRITE, /* covers already allocated extent */
30}; 31};
31 32
32#define XFS_IO_TYPES \ 33#define XFS_IO_TYPES \
34 { XFS_IO_INVALID, "invalid" }, \
33 { XFS_IO_DELALLOC, "delalloc" }, \ 35 { XFS_IO_DELALLOC, "delalloc" }, \
34 { XFS_IO_UNWRITTEN, "unwritten" }, \ 36 { XFS_IO_UNWRITTEN, "unwritten" }, \
35 { XFS_IO_OVERWRITE, "overwrite" } 37 { XFS_IO_OVERWRITE, "overwrite" }
@@ -39,7 +41,7 @@ enum {
39 * It can manage several multi-page bio's at once. 41 * It can manage several multi-page bio's at once.
40 */ 42 */
41typedef struct xfs_ioend { 43typedef struct xfs_ioend {
42 struct xfs_ioend *io_list; /* next ioend in chain */ 44 struct list_head io_list; /* next ioend in chain */
43 unsigned int io_type; /* delalloc / unwritten */ 45 unsigned int io_type; /* delalloc / unwritten */
44 int io_error; /* I/O error code */ 46 int io_error; /* I/O error code */
45 atomic_t io_remaining; /* hold count */ 47 atomic_t io_remaining; /* hold count */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 0ef7c2ed3f8a..4fa14820e2e2 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
202 sbp->namelen, 202 sbp->namelen,
203 sbp->valuelen, 203 sbp->valuelen,
204 &sbp->name[sbp->namelen]); 204 &sbp->name[sbp->namelen]);
205 if (error) 205 if (error) {
206 kmem_free(sbuf);
206 return error; 207 return error;
208 }
207 if (context->seen_enough) 209 if (context->seen_enough)
208 break; 210 break;
209 cursor->offset++; 211 cursor->offset++;
@@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int(
454 args.rmtblkcnt = xfs_attr3_rmt_blocks( 456 args.rmtblkcnt = xfs_attr3_rmt_blocks(
455 args.dp->i_mount, valuelen); 457 args.dp->i_mount, valuelen);
456 retval = xfs_attr_rmtval_get(&args); 458 retval = xfs_attr_rmtval_get(&args);
457 if (retval) 459 if (!retval)
458 return retval; 460 retval = context->put_listent(context,
459 retval = context->put_listent(context, 461 entry->flags,
460 entry->flags, 462 name_rmt->name,
461 name_rmt->name, 463 (int)name_rmt->namelen,
462 (int)name_rmt->namelen, 464 valuelen,
463 valuelen, 465 args.value);
464 args.value);
465 kmem_free(args.value); 466 kmem_free(args.value);
466 } else { 467 } else {
467 retval = context->put_listent(context, 468 retval = context->put_listent(context,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6c876012b2e5..a32c1dcae2ff 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -203,10 +203,12 @@ xfs_bmap_rtalloc(
203 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 203 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
204 204
205 /* 205 /*
206 * Lock out other modifications to the RT bitmap inode. 206 * Lock out modifications to both the RT bitmap and summary inodes
207 */ 207 */
208 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); 208 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
209 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); 209 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
210 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
211 xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
210 212
211 /* 213 /*
212 * If it's an allocation to an empty file at offset 0, 214 * If it's an allocation to an empty file at offset 0,
@@ -822,7 +824,7 @@ bool
822xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) 824xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
823{ 825{
824 /* prealloc/delalloc exists only on regular files */ 826 /* prealloc/delalloc exists only on regular files */
825 if (!S_ISREG(ip->i_d.di_mode)) 827 if (!S_ISREG(VFS_I(ip)->i_mode))
826 return false; 828 return false;
827 829
828 /* 830 /*
@@ -1727,7 +1729,7 @@ xfs_swap_extents(
1727 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); 1729 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1728 1730
1729 /* Verify that both files have the same format */ 1731 /* Verify that both files have the same format */
1730 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1732 if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
1731 error = -EINVAL; 1733 error = -EINVAL;
1732 goto out_unlock; 1734 goto out_unlock;
1733 } 1735 }
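
The widened critical section above exists because the RT allocator consults the summary inode as well as the bitmap inode, so both must be locked, always in the same order, before the transaction joins them. A pthread stand-in for the ordering rule (xfs_ilock() itself is kernel-internal and not shown):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rbm_lock = PTHREAD_MUTEX_INITIALIZER;  /* rt bitmap */
static pthread_mutex_t rsum_lock = PTHREAD_MUTEX_INITIALIZER; /* rt summary */

int main(void)
{
	/* Fixed order: bitmap, then summary -- every path must agree. */
	pthread_mutex_lock(&rbm_lock);
	pthread_mutex_lock(&rsum_lock);
	printf("both RT inodes locked; safe to allocate\n");
	pthread_mutex_unlock(&rsum_lock);
	pthread_mutex_unlock(&rbm_lock);
	return 0;
}
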
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 435c7de42e5f..9a2191b91137 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -650,7 +650,7 @@ xfs_buf_read_map(
650 if (bp) { 650 if (bp) {
651 trace_xfs_buf_read(bp, flags, _RET_IP_); 651 trace_xfs_buf_read(bp, flags, _RET_IP_);
652 652
653 if (!XFS_BUF_ISDONE(bp)) { 653 if (!(bp->b_flags & XBF_DONE)) {
654 XFS_STATS_INC(target->bt_mount, xb_get_read); 654 XFS_STATS_INC(target->bt_mount, xb_get_read);
655 bp->b_ops = ops; 655 bp->b_ops = ops;
656 _xfs_buf_read(bp, flags); 656 _xfs_buf_read(bp, flags);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c75721acd867..4eb89bd4ee73 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -302,6 +302,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
302 302
303/* Buffer Utility Routines */ 303/* Buffer Utility Routines */
304extern void *xfs_buf_offset(struct xfs_buf *, size_t); 304extern void *xfs_buf_offset(struct xfs_buf *, size_t);
305extern void xfs_buf_stale(struct xfs_buf *bp);
305 306
306/* Delayed Write Buffer Routines */ 307/* Delayed Write Buffer Routines */
307extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); 308extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
@@ -312,31 +313,6 @@ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
312extern int xfs_buf_init(void); 313extern int xfs_buf_init(void);
313extern void xfs_buf_terminate(void); 314extern void xfs_buf_terminate(void);
314 315
315#define XFS_BUF_ZEROFLAGS(bp) \
316 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
317 XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
318 XBF_WRITE_FAIL))
319
320void xfs_buf_stale(struct xfs_buf *bp);
321#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
322#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
323
324#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
325#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
326#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
327
328#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
329#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
330#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
331
332#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
333#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
334#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
335
336#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
337#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
338#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
339
340/* 316/*
341 * These macros use the IO block map rather than b_bn. b_bn is now really 317 * These macros use the IO block map rather than b_bn. b_bn is now really
342 * just for the buffer cache index for cached buffers. As IO does not use b_bn 318 * just for the buffer cache index for cached buffers. As IO does not use b_bn
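
With the wrapper macros removed, callers manipulate b_flags directly, as the xfs_buf_item.c hunks below show. A tiny self-contained demo of the open-coded style, using illustrative bit positions rather than the kernel's:

#include <assert.h>

#define XBF_DONE	(1 << 3)	/* illustrative values */
#define XBF_STALE	(1 << 6)

struct buf { unsigned int b_flags; };

int main(void)
{
	struct buf bp = { .b_flags = 0 };

	bp.b_flags |= XBF_DONE;			/* was XFS_BUF_DONE(bp) */
	assert(bp.b_flags & XBF_DONE);		/* was XFS_BUF_ISDONE(bp) */
	bp.b_flags &= ~XBF_DONE;		/* was XFS_BUF_UNDONE(bp) */
	assert(!(bp.b_flags & (XBF_DONE | XBF_STALE)));
	return 0;
}

The open-coded form says exactly which bit is set or cleared at each site, which is the point of dropping the macro layer.
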
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7e986da34f6c..99e91a0e554e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -431,7 +431,7 @@ xfs_buf_item_unpin(
431 if (freed && stale) { 431 if (freed && stale) {
432 ASSERT(bip->bli_flags & XFS_BLI_STALE); 432 ASSERT(bip->bli_flags & XFS_BLI_STALE);
433 ASSERT(xfs_buf_islocked(bp)); 433 ASSERT(xfs_buf_islocked(bp));
434 ASSERT(XFS_BUF_ISSTALE(bp)); 434 ASSERT(bp->b_flags & XBF_STALE);
435 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 435 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
436 436
437 trace_xfs_buf_item_unpin_stale(bip); 437 trace_xfs_buf_item_unpin_stale(bip);
@@ -493,7 +493,7 @@ xfs_buf_item_unpin(
493 xfs_buf_hold(bp); 493 xfs_buf_hold(bp);
494 bp->b_flags |= XBF_ASYNC; 494 bp->b_flags |= XBF_ASYNC;
495 xfs_buf_ioerror(bp, -EIO); 495 xfs_buf_ioerror(bp, -EIO);
496 XFS_BUF_UNDONE(bp); 496 bp->b_flags &= ~XBF_DONE;
497 xfs_buf_stale(bp); 497 xfs_buf_stale(bp);
498 xfs_buf_ioend(bp); 498 xfs_buf_ioend(bp);
499 } 499 }
@@ -1067,7 +1067,7 @@ xfs_buf_iodone_callbacks(
1067 */ 1067 */
1068 if (XFS_FORCED_SHUTDOWN(mp)) { 1068 if (XFS_FORCED_SHUTDOWN(mp)) {
1069 xfs_buf_stale(bp); 1069 xfs_buf_stale(bp);
1070 XFS_BUF_DONE(bp); 1070 bp->b_flags |= XBF_DONE;
1071 trace_xfs_buf_item_iodone(bp, _RET_IP_); 1071 trace_xfs_buf_item_iodone(bp, _RET_IP_);
1072 goto do_callbacks; 1072 goto do_callbacks;
1073 } 1073 }
@@ -1090,7 +1090,7 @@ xfs_buf_iodone_callbacks(
1090 * errors tend to affect the whole device and a failing log write 1090 * errors tend to affect the whole device and a failing log write
1091 * will make us give up. But we really ought to do better here. 1091 * will make us give up. But we really ought to do better here.
1092 */ 1092 */
1093 if (XFS_BUF_ISASYNC(bp)) { 1093 if (bp->b_flags & XBF_ASYNC) {
1094 ASSERT(bp->b_iodone != NULL); 1094 ASSERT(bp->b_iodone != NULL);
1095 1095
1096 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 1096 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1113,7 +1113,7 @@ xfs_buf_iodone_callbacks(
1113 * sure to return the error to the caller of xfs_bwrite(). 1113 * sure to return the error to the caller of xfs_bwrite().
1114 */ 1114 */
1115 xfs_buf_stale(bp); 1115 xfs_buf_stale(bp);
1116 XFS_BUF_DONE(bp); 1116 bp->b_flags |= XBF_DONE;
1117 1117
1118 trace_xfs_buf_error_relse(bp, _RET_IP_); 1118 trace_xfs_buf_error_relse(bp, _RET_IP_);
1119 1119
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 642d55d10075..93b3ab0c5435 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -665,7 +665,7 @@ xfs_readdir(
665 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 665 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
666 return -EIO; 666 return -EIO;
667 667
668 ASSERT(S_ISDIR(dp->i_d.di_mode)); 668 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
669 XFS_STATS_INC(dp->i_mount, xs_dir_getdents); 669 XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
670 670
671 args.dp = dp; 671 args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index e85a9519a5ae..272c3f8b6f7d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -227,7 +227,7 @@ xfs_discard_extents(
227 GFP_NOFS, 0); 227 GFP_NOFS, 0);
228 if (error && error != -EOPNOTSUPP) { 228 if (error && error != -EOPNOTSUPP) {
229 xfs_info(mp, 229 xfs_info(mp,
230 "discard failed for extent [0x%llu,%u], error %d", 230 "discard failed for extent [0x%llx,%u], error %d",
231 (unsigned long long)busyp->bno, 231 (unsigned long long)busyp->bno,
232 busyp->length, 232 busyp->length,
233 error); 233 error);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9c44d38dcd1f..316b2a1bdba5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -92,26 +92,28 @@ xfs_qm_adjust_dqlimits(
92{ 92{
93 struct xfs_quotainfo *q = mp->m_quotainfo; 93 struct xfs_quotainfo *q = mp->m_quotainfo;
94 struct xfs_disk_dquot *d = &dq->q_core; 94 struct xfs_disk_dquot *d = &dq->q_core;
95 struct xfs_def_quota *defq;
95 int prealloc = 0; 96 int prealloc = 0;
96 97
97 ASSERT(d->d_id); 98 ASSERT(d->d_id);
99 defq = xfs_get_defquota(dq, q);
98 100
99 if (q->qi_bsoftlimit && !d->d_blk_softlimit) { 101 if (defq->bsoftlimit && !d->d_blk_softlimit) {
100 d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); 102 d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
101 prealloc = 1; 103 prealloc = 1;
102 } 104 }
103 if (q->qi_bhardlimit && !d->d_blk_hardlimit) { 105 if (defq->bhardlimit && !d->d_blk_hardlimit) {
104 d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); 106 d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
105 prealloc = 1; 107 prealloc = 1;
106 } 108 }
107 if (q->qi_isoftlimit && !d->d_ino_softlimit) 109 if (defq->isoftlimit && !d->d_ino_softlimit)
108 d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); 110 d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
109 if (q->qi_ihardlimit && !d->d_ino_hardlimit) 111 if (defq->ihardlimit && !d->d_ino_hardlimit)
110 d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); 112 d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
111 if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) 113 if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
112 d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); 114 d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
113 if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) 115 if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
114 d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); 116 d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
115 117
116 if (prealloc) 118 if (prealloc)
117 xfs_dquot_set_prealloc_limits(dq); 119 xfs_dquot_set_prealloc_limits(dq);
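
The limit-defaulting hunk above swaps the single global qi_* limits for a per-quota-type xfs_def_quota looked up via xfs_get_defquota(). A minimal userspace mock of that lookup, with invented types and numbers:

#include <stdio.h>

enum qtype { Q_USER, Q_GROUP, Q_PROJ, Q_NTYPES };

struct def_quota { unsigned long long bsoftlimit, bhardlimit; };

static struct def_quota defaults[Q_NTYPES] = {
	[Q_USER]  = { 1000, 2000 },		/* invented numbers */
	[Q_GROUP] = { 5000, 8000 },
};

static struct def_quota *get_defquota(enum qtype type)
{
	return &defaults[type];	/* the real lookup keys off dq_flags */
}

int main(void)
{
	struct def_quota *dq = get_defquota(Q_GROUP);

	printf("group defaults: soft=%llu hard=%llu\n",
	       dq->bsoftlimit, dq->bhardlimit);
	return 0;
}
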
@@ -232,7 +234,8 @@ xfs_qm_init_dquot_blk(
232{ 234{
233 struct xfs_quotainfo *q = mp->m_quotainfo; 235 struct xfs_quotainfo *q = mp->m_quotainfo;
234 xfs_dqblk_t *d; 236 xfs_dqblk_t *d;
235 int curid, i; 237 xfs_dqid_t curid;
238 int i;
236 239
237 ASSERT(tp); 240 ASSERT(tp);
238 ASSERT(xfs_buf_islocked(bp)); 241 ASSERT(xfs_buf_islocked(bp));
@@ -243,7 +246,6 @@ xfs_qm_init_dquot_blk(
243 * ID of the first dquot in the block - id's are zero based. 246 * ID of the first dquot in the block - id's are zero based.
244 */ 247 */
245 curid = id - (id % q->qi_dqperchunk); 248 curid = id - (id % q->qi_dqperchunk);
246 ASSERT(curid >= 0);
247 memset(d, 0, BBTOB(q->qi_dqchunklen)); 249 memset(d, 0, BBTOB(q->qi_dqchunklen));
248 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { 250 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
249 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 251 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
@@ -464,12 +466,13 @@ xfs_qm_dqtobp(
464 struct xfs_bmbt_irec map; 466 struct xfs_bmbt_irec map;
465 int nmaps = 1, error; 467 int nmaps = 1, error;
466 struct xfs_buf *bp; 468 struct xfs_buf *bp;
467 struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); 469 struct xfs_inode *quotip;
468 struct xfs_mount *mp = dqp->q_mount; 470 struct xfs_mount *mp = dqp->q_mount;
469 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 471 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
470 struct xfs_trans *tp = (tpp ? *tpp : NULL); 472 struct xfs_trans *tp = (tpp ? *tpp : NULL);
471 uint lock_mode; 473 uint lock_mode;
472 474
475 quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 476 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
474 477
475 lock_mode = xfs_ilock_data_map_shared(quotip); 478 lock_mode = xfs_ilock_data_map_shared(quotip);
@@ -685,6 +688,56 @@ error0:
685} 688}
686 689
687/* 690/*
 691 * Advance to the next id in the current chunk, or if at the
 692 * end of the chunk, skip ahead to the first id in the next allocated
 693 * chunk using the SEEK_DATA interface.
694 */
695int
696xfs_dq_get_next_id(
697 xfs_mount_t *mp,
698 uint type,
699 xfs_dqid_t *id,
700 loff_t eof)
701{
702 struct xfs_inode *quotip;
703 xfs_fsblock_t start;
704 loff_t offset;
705 uint lock;
706 xfs_dqid_t next_id;
707 int error = 0;
708
709 /* Simple advance */
710 next_id = *id + 1;
711
712 /* If new ID is within the current chunk, advancing it sufficed */
713 if (next_id % mp->m_quotainfo->qi_dqperchunk) {
714 *id = next_id;
715 return 0;
716 }
717
718 /* Nope, next_id is now past the current chunk, so find the next one */
719 start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
720
721 quotip = xfs_quota_inode(mp, type);
722 lock = xfs_ilock_data_map_shared(quotip);
723
724 offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start),
725 eof, SEEK_DATA);
726 if (offset < 0)
727 error = offset;
728
729 xfs_iunlock(quotip, lock);
730
731 /* -ENXIO is essentially "no more data" */
732 if (error)
 733 return (error == -ENXIO ? -ENOENT : error);
734
735 /* Convert next data offset back to a quota id */
736 *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk;
737 return 0;
738}
739
740/*
 688 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return 741 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return
 689 * a locked dquot, doing an allocation (if requested) as needed. 742 * a locked dquot, doing an allocation (if requested) as needed.
690 * When both an inode and an id are given, the inode's id takes precedence. 743 * When both an inode and an id are given, the inode's id takes precedence.
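
The arithmetic in xfs_dq_get_next_id() relies on qi_dqperchunk dquots packing into each filesystem block: an id that does not land on a chunk boundary stays in the current block, anything else maps to a byte offset that SEEK_DATA can skip forward from, and the found offset maps back to an id. A runnable sketch of that mapping, with invented sizes:

#include <stdio.h>

#define DQPERCHUNK	30	/* dquots per FS block; real value computed */
#define FSB_SIZE	4096

int main(void)
{
	unsigned int id = 59;		/* last id of chunk 1 */
	unsigned int next_id = id + 1;

	if (next_id % DQPERCHUNK) {
		printf("still in chunk %u\n", next_id / DQPERCHUNK);
	} else {
		/* Would SEEK_DATA from this byte offset in the quota
		 * file, then map the found offset back to an id. */
		long long start = (long long)(next_id / DQPERCHUNK) * FSB_SIZE;
		long long found = start + 2LL * FSB_SIZE; /* pretend hole */
		unsigned int new_id = (unsigned int)(found / FSB_SIZE)
					* DQPERCHUNK;

		printf("next allocated chunk starts at id %u\n", new_id);
	}
	return 0;
}
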
@@ -704,6 +757,7 @@ xfs_qm_dqget(
704 struct xfs_quotainfo *qi = mp->m_quotainfo; 757 struct xfs_quotainfo *qi = mp->m_quotainfo;
705 struct radix_tree_root *tree = xfs_dquot_tree(qi, type); 758 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
706 struct xfs_dquot *dqp; 759 struct xfs_dquot *dqp;
760 loff_t eof = 0;
707 int error; 761 int error;
708 762
709 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 763 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -731,6 +785,21 @@ xfs_qm_dqget(
731 } 785 }
732#endif 786#endif
733 787
788 /* Get the end of the quota file if we need it */
789 if (flags & XFS_QMOPT_DQNEXT) {
790 struct xfs_inode *quotip;
791 xfs_fileoff_t last;
792 uint lock_mode;
793
794 quotip = xfs_quota_inode(mp, type);
795 lock_mode = xfs_ilock_data_map_shared(quotip);
796 error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK);
797 xfs_iunlock(quotip, lock_mode);
798 if (error)
799 return error;
800 eof = XFS_FSB_TO_B(mp, last);
801 }
802
734restart: 803restart:
735 mutex_lock(&qi->qi_tree_lock); 804 mutex_lock(&qi->qi_tree_lock);
736 dqp = radix_tree_lookup(tree, id); 805 dqp = radix_tree_lookup(tree, id);
@@ -744,6 +813,18 @@ restart:
744 goto restart; 813 goto restart;
745 } 814 }
746 815
816 /* uninit / unused quota found in radix tree, keep looking */
817 if (flags & XFS_QMOPT_DQNEXT) {
818 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
819 xfs_dqunlock(dqp);
820 mutex_unlock(&qi->qi_tree_lock);
821 error = xfs_dq_get_next_id(mp, type, &id, eof);
822 if (error)
823 return error;
824 goto restart;
825 }
826 }
827
747 dqp->q_nrefs++; 828 dqp->q_nrefs++;
748 mutex_unlock(&qi->qi_tree_lock); 829 mutex_unlock(&qi->qi_tree_lock);
749 830
@@ -770,6 +851,13 @@ restart:
770 if (ip) 851 if (ip)
771 xfs_ilock(ip, XFS_ILOCK_EXCL); 852 xfs_ilock(ip, XFS_ILOCK_EXCL);
772 853
854 /* If we are asked to find next active id, keep looking */
855 if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
856 error = xfs_dq_get_next_id(mp, type, &id, eof);
857 if (!error)
858 goto restart;
859 }
860
773 if (error) 861 if (error)
774 return error; 862 return error;
775 863
@@ -820,6 +908,17 @@ restart:
820 qi->qi_dquots++; 908 qi->qi_dquots++;
821 mutex_unlock(&qi->qi_tree_lock); 909 mutex_unlock(&qi->qi_tree_lock);
822 910
911 /* If we are asked to find next active id, keep looking */
912 if (flags & XFS_QMOPT_DQNEXT) {
913 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
914 xfs_qm_dqput(dqp);
915 error = xfs_dq_get_next_id(mp, type, &id, eof);
916 if (error)
917 return error;
918 goto restart;
919 }
920 }
921
823 dqret: 922 dqret:
824 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 923 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
825 trace_xfs_dqget_miss(dqp); 924 trace_xfs_dqget_miss(dqp);
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 652cd3c5b58c..a1b2dd828b9d 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -152,7 +152,7 @@ xfs_nfs_get_inode(
152 return ERR_PTR(error); 152 return ERR_PTR(error);
153 } 153 }
154 154
155 if (ip->i_d.di_gen != generation) { 155 if (VFS_I(ip)->i_generation != generation) {
156 IRELE(ip); 156 IRELE(ip);
157 return ERR_PTR(-ESTALE); 157 return ERR_PTR(-ESTALE);
158 } 158 }
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
246 .fh_to_parent = xfs_fs_fh_to_parent, 246 .fh_to_parent = xfs_fs_fh_to_parent,
247 .get_parent = xfs_fs_get_parent, 247 .get_parent = xfs_fs_get_parent,
248 .commit_metadata = xfs_fs_nfs_commit_metadata, 248 .commit_metadata = xfs_fs_nfs_commit_metadata,
249#ifdef CONFIG_NFSD_PNFS 249#ifdef CONFIG_NFSD_BLOCKLAYOUT
250 .get_uuid = xfs_fs_get_uuid, 250 .get_uuid = xfs_fs_get_uuid,
251 .map_blocks = xfs_fs_map_blocks, 251 .map_blocks = xfs_fs_map_blocks,
252 .commit_blocks = xfs_fs_commit_blocks, 252 .commit_blocks = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52883ac3cf84..ac0fd32de31e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -156,9 +156,9 @@ xfs_update_prealloc_flags(
156 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 156 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
157 157
158 if (!(flags & XFS_PREALLOC_INVISIBLE)) { 158 if (!(flags & XFS_PREALLOC_INVISIBLE)) {
159 ip->i_d.di_mode &= ~S_ISUID; 159 VFS_I(ip)->i_mode &= ~S_ISUID;
160 if (ip->i_d.di_mode & S_IXGRP) 160 if (VFS_I(ip)->i_mode & S_IXGRP)
161 ip->i_d.di_mode &= ~S_ISGID; 161 VFS_I(ip)->i_mode &= ~S_ISGID;
162 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 162 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
163 } 163 }
164 164
@@ -1337,31 +1337,31 @@ out:
1337 return found; 1337 return found;
1338} 1338}
1339 1339
1340STATIC loff_t 1340/*
1341xfs_seek_hole_data( 1341 * The caller must lock the inode with xfs_ilock_data_map_shared();
1342 struct file *file, 1342 * can we craft an appropriate ASSERT?
1343 *
1344 * @end exists because the VFS-level lseek interface is defined such that any
1345 * offset past i_size shall return -ENXIO, but we use this for quota code
1346 * which does not maintain i_size, and we want to SEEK_DATA past i_size.
1347 */
1348loff_t
1349__xfs_seek_hole_data(
1350 struct inode *inode,
1343 loff_t start, 1351 loff_t start,
1352 loff_t end,
1344 int whence) 1353 int whence)
1345{ 1354{
1346 struct inode *inode = file->f_mapping->host;
1347 struct xfs_inode *ip = XFS_I(inode); 1355 struct xfs_inode *ip = XFS_I(inode);
1348 struct xfs_mount *mp = ip->i_mount; 1356 struct xfs_mount *mp = ip->i_mount;
1349 loff_t uninitialized_var(offset); 1357 loff_t uninitialized_var(offset);
1350 xfs_fsize_t isize;
1351 xfs_fileoff_t fsbno; 1358 xfs_fileoff_t fsbno;
1352 xfs_filblks_t end; 1359 xfs_filblks_t lastbno;
1353 uint lock;
1354 int error; 1360 int error;
1355 1361
1356 if (XFS_FORCED_SHUTDOWN(mp)) 1362 if (start >= end) {
1357 return -EIO;
1358
1359 lock = xfs_ilock_data_map_shared(ip);
1360
1361 isize = i_size_read(inode);
1362 if (start >= isize) {
1363 error = -ENXIO; 1363 error = -ENXIO;
1364 goto out_unlock; 1364 goto out_error;
1365 } 1365 }
1366 1366
1367 /* 1367 /*
@@ -1369,22 +1369,22 @@ xfs_seek_hole_data(
1369 * by fsbno to the end block of the file. 1369 * by fsbno to the end block of the file.
1370 */ 1370 */
1371 fsbno = XFS_B_TO_FSBT(mp, start); 1371 fsbno = XFS_B_TO_FSBT(mp, start);
1372 end = XFS_B_TO_FSB(mp, isize); 1372 lastbno = XFS_B_TO_FSB(mp, end);
1373 1373
1374 for (;;) { 1374 for (;;) {
1375 struct xfs_bmbt_irec map[2]; 1375 struct xfs_bmbt_irec map[2];
1376 int nmap = 2; 1376 int nmap = 2;
1377 unsigned int i; 1377 unsigned int i;
1378 1378
1379 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, 1379 error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
1380 XFS_BMAPI_ENTIRE); 1380 XFS_BMAPI_ENTIRE);
1381 if (error) 1381 if (error)
1382 goto out_unlock; 1382 goto out_error;
1383 1383
1384 /* No extents at given offset, must be beyond EOF */ 1384 /* No extents at given offset, must be beyond EOF */
1385 if (nmap == 0) { 1385 if (nmap == 0) {
1386 error = -ENXIO; 1386 error = -ENXIO;
1387 goto out_unlock; 1387 goto out_error;
1388 } 1388 }
1389 1389
1390 for (i = 0; i < nmap; i++) { 1390 for (i = 0; i < nmap; i++) {
@@ -1426,7 +1426,7 @@ xfs_seek_hole_data(
1426 * hole at the end of any file). 1426 * hole at the end of any file).
1427 */ 1427 */
1428 if (whence == SEEK_HOLE) { 1428 if (whence == SEEK_HOLE) {
1429 offset = isize; 1429 offset = end;
1430 break; 1430 break;
1431 } 1431 }
1432 /* 1432 /*
@@ -1434,7 +1434,7 @@ xfs_seek_hole_data(
1434 */ 1434 */
1435 ASSERT(whence == SEEK_DATA); 1435 ASSERT(whence == SEEK_DATA);
1436 error = -ENXIO; 1436 error = -ENXIO;
1437 goto out_unlock; 1437 goto out_error;
1438 } 1438 }
1439 1439
1440 ASSERT(i > 1); 1440 ASSERT(i > 1);
@@ -1445,14 +1445,14 @@ xfs_seek_hole_data(
1445 */ 1445 */
1446 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; 1446 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1447 start = XFS_FSB_TO_B(mp, fsbno); 1447 start = XFS_FSB_TO_B(mp, fsbno);
1448 if (start >= isize) { 1448 if (start >= end) {
1449 if (whence == SEEK_HOLE) { 1449 if (whence == SEEK_HOLE) {
1450 offset = isize; 1450 offset = end;
1451 break; 1451 break;
1452 } 1452 }
1453 ASSERT(whence == SEEK_DATA); 1453 ASSERT(whence == SEEK_DATA);
1454 error = -ENXIO; 1454 error = -ENXIO;
1455 goto out_unlock; 1455 goto out_error;
1456 } 1456 }
1457 } 1457 }
1458 1458
@@ -1464,7 +1464,39 @@ out:
1464 * situation in particular. 1464 * situation in particular.
1465 */ 1465 */
1466 if (whence == SEEK_HOLE) 1466 if (whence == SEEK_HOLE)
1467 offset = min_t(loff_t, offset, isize); 1467 offset = min_t(loff_t, offset, end);
1468
1469 return offset;
1470
1471out_error:
1472 return error;
1473}
1474
1475STATIC loff_t
1476xfs_seek_hole_data(
1477 struct file *file,
1478 loff_t start,
1479 int whence)
1480{
1481 struct inode *inode = file->f_mapping->host;
1482 struct xfs_inode *ip = XFS_I(inode);
1483 struct xfs_mount *mp = ip->i_mount;
1484 uint lock;
1485 loff_t offset, end;
1486 int error = 0;
1487
1488 if (XFS_FORCED_SHUTDOWN(mp))
1489 return -EIO;
1490
1491 lock = xfs_ilock_data_map_shared(ip);
1492
1493 end = i_size_read(inode);
1494 offset = __xfs_seek_hole_data(inode, start, end, whence);
1495 if (offset < 0) {
1496 error = offset;
1497 goto out_unlock;
1498 }
1499
1468 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1500 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1469 1501
1470out_unlock: 1502out_unlock:
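
__xfs_seek_hole_data() now takes an explicit end so the quota code can probe past i_size, while the file-level wrapper keeps the VFS rule that offsets at or beyond i_size yield -ENXIO. The same contract is visible from userspace; this demo assumes a filesystem that implements SEEK_DATA/SEEK_HOLE (lseek returns -1/ENXIO otherwise or past EOF):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;
	data = lseek(fd, 0, SEEK_DATA);		/* first data at/after 0 */
	hole = lseek(fd, 0, SEEK_HOLE);		/* EOF counts as a hole */
	printf("data at %lld, hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}
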
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c4c130f9bfb6..a51353a1f87f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -151,7 +151,7 @@ xfs_filestream_pick_ag(
151 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 151 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
152 int err, trylock, nscan; 152 int err, trylock, nscan;
153 153
154 ASSERT(S_ISDIR(ip->i_d.di_mode)); 154 ASSERT(S_ISDIR(VFS_I(ip)->i_mode));
155 155
156 /* 2% of an AG's blocks must be free for it to be chosen. */ 156 /* 2% of an AG's blocks must be free for it to be chosen. */
157 minfree = mp->m_sb.sb_agblocks / 50; 157 minfree = mp->m_sb.sb_agblocks / 50;
@@ -319,7 +319,7 @@ xfs_filestream_lookup_ag(
319 xfs_agnumber_t startag, ag = NULLAGNUMBER; 319 xfs_agnumber_t startag, ag = NULLAGNUMBER;
320 struct xfs_mru_cache_elem *mru; 320 struct xfs_mru_cache_elem *mru;
321 321
322 ASSERT(S_ISREG(ip->i_d.di_mode)); 322 ASSERT(S_ISREG(VFS_I(ip)->i_mode));
323 323
324 pip = xfs_filestream_get_parent(ip); 324 pip = xfs_filestream_get_parent(ip);
325 if (!pip) 325 if (!pip)
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 1b6a98b66886..f32713f14f9a 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,5 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 28
30#endif /* __XFS_FSOPS_H__ */ 29#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index d7a490f24ead..bf2d60749278 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,6 +63,9 @@ xfs_inode_alloc(
63 return NULL; 63 return NULL;
64 } 64 }
65 65
66 /* VFS doesn't initialise i_mode! */
67 VFS_I(ip)->i_mode = 0;
68
66 XFS_STATS_INC(mp, vn_active); 69 XFS_STATS_INC(mp, vn_active);
67 ASSERT(atomic_read(&ip->i_pincount) == 0); 70 ASSERT(atomic_read(&ip->i_pincount) == 0);
68 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 71 ASSERT(!spin_is_locked(&ip->i_flags_lock));
@@ -79,7 +82,7 @@ xfs_inode_alloc(
79 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
80 ip->i_flags = 0; 83 ip->i_flags = 0;
81 ip->i_delayed_blks = 0; 84 ip->i_delayed_blks = 0;
82 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 85 memset(&ip->i_d, 0, sizeof(ip->i_d));
83 86
84 return ip; 87 return ip;
85} 88}
@@ -98,7 +101,7 @@ void
98xfs_inode_free( 101xfs_inode_free(
99 struct xfs_inode *ip) 102 struct xfs_inode *ip)
100{ 103{
101 switch (ip->i_d.di_mode & S_IFMT) { 104 switch (VFS_I(ip)->i_mode & S_IFMT) {
102 case S_IFREG: 105 case S_IFREG:
103 case S_IFDIR: 106 case S_IFDIR:
104 case S_IFLNK: 107 case S_IFLNK:
@@ -135,6 +138,34 @@ xfs_inode_free(
135} 138}
136 139
137/* 140/*
141 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 142 * part of the structure. This is made more complex by the fact that we store
143 * information about the on-disk values in the VFS inode and so we can't just
144 * overwrite the values unconditionally. Hence we save the parameters we
145 * need to retain across reinitialisation, and rewrite them into the VFS inode
146 * after reinitialisation even if it fails.
147 */
148static int
149xfs_reinit_inode(
150 struct xfs_mount *mp,
151 struct inode *inode)
152{
153 int error;
154 uint32_t nlink = inode->i_nlink;
155 uint32_t generation = inode->i_generation;
156 uint64_t version = inode->i_version;
157 umode_t mode = inode->i_mode;
158
159 error = inode_init_always(mp->m_super, inode);
160
161 set_nlink(inode, nlink);
162 inode->i_generation = generation;
163 inode->i_version = version;
164 inode->i_mode = mode;
165 return error;
166}
167
168/*
 138 * Check the validity of the inode we just found in the cache 169 * Check the validity of the inode we just found in the cache
139 */ 170 */
140static int 171static int
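
xfs_reinit_inode() captures the pattern its comment describes: snapshot the fields that mirror on-disk state, run inode_init_always(), then write them back whether or not it failed. A self-contained mock of the same save/reinit/restore shape, with stand-in types:

#include <stdio.h>

struct mock_inode { unsigned int nlink, generation, mode; };

/* Stand-in for inode_init_always(): wipes everything. */
static int init_always(struct mock_inode *inode)
{
	inode->nlink = 1;
	inode->generation = 0;
	inode->mode = 0;
	return 0;
}

static int reinit_inode(struct mock_inode *inode)
{
	unsigned int nlink = inode->nlink;	/* save on-disk mirrors */
	unsigned int generation = inode->generation;
	unsigned int mode = inode->mode;
	int error = init_always(inode);

	inode->nlink = nlink;			/* restore even on error */
	inode->generation = generation;
	inode->mode = mode;
	return error;
}

int main(void)
{
	struct mock_inode ino = { .nlink = 3, .generation = 7,
				  .mode = 0100644 };

	reinit_inode(&ino);
	printf("nlink=%u gen=%u mode=%o\n", ino.nlink, ino.generation,
	       ino.mode);
	return 0;
}
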
@@ -185,7 +216,7 @@ xfs_iget_cache_hit(
185 /* 216 /*
186 * If lookup is racing with unlink return an error immediately. 217 * If lookup is racing with unlink return an error immediately.
187 */ 218 */
188 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { 219 if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
189 error = -ENOENT; 220 error = -ENOENT;
190 goto out_error; 221 goto out_error;
191 } 222 }
@@ -208,7 +239,7 @@ xfs_iget_cache_hit(
208 spin_unlock(&ip->i_flags_lock); 239 spin_unlock(&ip->i_flags_lock);
209 rcu_read_unlock(); 240 rcu_read_unlock();
210 241
211 error = inode_init_always(mp->m_super, inode); 242 error = xfs_reinit_inode(mp, inode);
212 if (error) { 243 if (error) {
213 /* 244 /*
214 * Re-initializing the inode failed, and we are in deep 245 * Re-initializing the inode failed, and we are in deep
@@ -295,7 +326,7 @@ xfs_iget_cache_miss(
295 326
296 trace_xfs_iget_miss(ip); 327 trace_xfs_iget_miss(ip);
297 328
298 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 329 if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
299 error = -ENOENT; 330 error = -ENOENT;
300 goto out_destroy; 331 goto out_destroy;
301 } 332 }
@@ -444,7 +475,7 @@ again:
444 * If we have a real type for an on-disk inode, we can setup the inode 475 * If we have a real type for an on-disk inode, we can setup the inode
445 * now. If it's a new inode being created, xfs_ialloc will handle it. 476 * now. If it's a new inode being created, xfs_ialloc will handle it.
446 */ 477 */
447 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 478 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
448 xfs_setup_existing_inode(ip); 479 xfs_setup_existing_inode(ip);
449 return 0; 480 return 0;
450 481
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ceba1a83cacc..96f606deee31 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -57,9 +57,9 @@ kmem_zone_t *xfs_inode_zone;
57 */ 57 */
58#define XFS_ITRUNC_MAX_EXTENTS 2 58#define XFS_ITRUNC_MAX_EXTENTS 2
59 59
60STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 60STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
61 61STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
62STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); 62STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
63 63
64/* 64/*
65 * helper function to extract extent size hint from inode 65 * helper function to extract extent size hint from inode
@@ -766,6 +766,7 @@ xfs_ialloc(
766 uint flags; 766 uint flags;
767 int error; 767 int error;
768 struct timespec tv; 768 struct timespec tv;
769 struct inode *inode;
769 770
770 /* 771 /*
771 * Call the space management code to pick 772 * Call the space management code to pick
@@ -791,6 +792,7 @@ xfs_ialloc(
791 if (error) 792 if (error)
792 return error; 793 return error;
793 ASSERT(ip != NULL); 794 ASSERT(ip != NULL);
795 inode = VFS_I(ip);
794 796
795 /* 797 /*
796 * We always convert v1 inodes to v2 now - we only support filesystems 798 * We always convert v1 inodes to v2 now - we only support filesystems
@@ -800,20 +802,16 @@ xfs_ialloc(
800 if (ip->i_d.di_version == 1) 802 if (ip->i_d.di_version == 1)
801 ip->i_d.di_version = 2; 803 ip->i_d.di_version = 2;
802 804
803 ip->i_d.di_mode = mode; 805 inode->i_mode = mode;
804 ip->i_d.di_onlink = 0; 806 set_nlink(inode, nlink);
805 ip->i_d.di_nlink = nlink;
806 ASSERT(ip->i_d.di_nlink == nlink);
807 ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); 807 ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
808 ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); 808 ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
809 xfs_set_projid(ip, prid); 809 xfs_set_projid(ip, prid);
810 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
811 810
812 if (pip && XFS_INHERIT_GID(pip)) { 811 if (pip && XFS_INHERIT_GID(pip)) {
813 ip->i_d.di_gid = pip->i_d.di_gid; 812 ip->i_d.di_gid = pip->i_d.di_gid;
814 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { 813 if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
815 ip->i_d.di_mode |= S_ISGID; 814 inode->i_mode |= S_ISGID;
816 }
817 } 815 }
818 816
819 /* 817 /*
@@ -822,38 +820,29 @@ xfs_ialloc(
822 * (and only if the irix_sgid_inherit compatibility variable is set). 820 * (and only if the irix_sgid_inherit compatibility variable is set).
823 */ 821 */
824 if ((irix_sgid_inherit) && 822 if ((irix_sgid_inherit) &&
825 (ip->i_d.di_mode & S_ISGID) && 823 (inode->i_mode & S_ISGID) &&
826 (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { 824 (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
827 ip->i_d.di_mode &= ~S_ISGID; 825 inode->i_mode &= ~S_ISGID;
828 }
829 826
830 ip->i_d.di_size = 0; 827 ip->i_d.di_size = 0;
831 ip->i_d.di_nextents = 0; 828 ip->i_d.di_nextents = 0;
832 ASSERT(ip->i_d.di_nblocks == 0); 829 ASSERT(ip->i_d.di_nblocks == 0);
833 830
834 tv = current_fs_time(mp->m_super); 831 tv = current_fs_time(mp->m_super);
835 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 832 inode->i_mtime = tv;
836 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 833 inode->i_atime = tv;
837 ip->i_d.di_atime = ip->i_d.di_mtime; 834 inode->i_ctime = tv;
838 ip->i_d.di_ctime = ip->i_d.di_mtime;
839 835
840 /*
841 * di_gen will have been taken care of in xfs_iread.
842 */
843 ip->i_d.di_extsize = 0; 836 ip->i_d.di_extsize = 0;
844 ip->i_d.di_dmevmask = 0; 837 ip->i_d.di_dmevmask = 0;
845 ip->i_d.di_dmstate = 0; 838 ip->i_d.di_dmstate = 0;
846 ip->i_d.di_flags = 0; 839 ip->i_d.di_flags = 0;
847 840
848 if (ip->i_d.di_version == 3) { 841 if (ip->i_d.di_version == 3) {
849 ASSERT(ip->i_d.di_ino == ino); 842 inode->i_version = 1;
850 ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
851 ip->i_d.di_crc = 0;
852 ip->i_d.di_changecount = 1;
853 ip->i_d.di_lsn = 0;
854 ip->i_d.di_flags2 = 0; 843 ip->i_d.di_flags2 = 0;
855 memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); 844 ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
856 ip->i_d.di_crtime = ip->i_d.di_mtime; 845 ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
857 } 846 }
858 847
859 848
@@ -1092,35 +1081,24 @@ xfs_dir_ialloc(
1092} 1081}
1093 1082
1094/* 1083/*
1095 * Decrement the link count on an inode & log the change. 1084 * Decrement the link count on an inode & log the change. If this causes the
1096 * If this causes the link count to go to zero, initiate the 1085 * link count to go to zero, move the inode to AGI unlinked list so that it can
1097 * logging activity required to truncate a file. 1086 * be freed when the last active reference goes away via xfs_inactive().
1098 */ 1087 */
1099int /* error */ 1088int /* error */
1100xfs_droplink( 1089xfs_droplink(
1101 xfs_trans_t *tp, 1090 xfs_trans_t *tp,
1102 xfs_inode_t *ip) 1091 xfs_inode_t *ip)
1103{ 1092{
1104 int error;
1105
1106 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1093 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1107 1094
1108 ASSERT (ip->i_d.di_nlink > 0);
1109 ip->i_d.di_nlink--;
1110 drop_nlink(VFS_I(ip)); 1095 drop_nlink(VFS_I(ip));
1111 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1096 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1112 1097
1113 error = 0; 1098 if (VFS_I(ip)->i_nlink)
1114 if (ip->i_d.di_nlink == 0) { 1099 return 0;
1115 /* 1100
1116 * We're dropping the last link to this file. 1101 return xfs_iunlink(tp, ip);
1117 * Move the on-disk inode to the AGI unlinked list.
1118 * From xfs_inactive() we will pull the inode from
1119 * the list and free it.
1120 */
1121 error = xfs_iunlink(tp, ip);
1122 }
1123 return error;
1124} 1102}
1125 1103
1126/* 1104/*
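
After the rewrite, xfs_droplink() touches only the VFS link count and moves the inode to the AGI unlinked list exactly when that count reaches zero. The control flow, reduced to a runnable mock with invented helpers:

#include <stdio.h>

struct mock_inode { unsigned int nlink; };

static int iunlink(struct mock_inode *ip)
{
	(void)ip;
	printf("nlink hit 0: move inode to AGI unlinked list\n");
	return 0;
}

static int droplink(struct mock_inode *ip)
{
	ip->nlink--;			/* only the VFS counter now */
	if (ip->nlink)
		return 0;		/* still named by a dir entry */
	return iunlink(ip);		/* park until last ref drops */
}

int main(void)
{
	struct mock_inode ino = { .nlink = 1 };

	return droplink(&ino);
}
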
@@ -1134,8 +1112,6 @@ xfs_bumplink(
1134 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1112 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1135 1113
1136 ASSERT(ip->i_d.di_version > 1); 1114 ASSERT(ip->i_d.di_version > 1);
1137 ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
1138 ip->i_d.di_nlink++;
1139 inc_nlink(VFS_I(ip)); 1115 inc_nlink(VFS_I(ip));
1140 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1116 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1141 return 0; 1117 return 0;
@@ -1393,7 +1369,6 @@ xfs_create_tmpfile(
1393 */ 1369 */
1394 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); 1370 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1395 1371
1396 ip->i_d.di_nlink--;
1397 error = xfs_iunlink(tp, ip); 1372 error = xfs_iunlink(tp, ip);
1398 if (error) 1373 if (error)
1399 goto out_trans_cancel; 1374 goto out_trans_cancel;
@@ -1444,7 +1419,7 @@ xfs_link(
1444 1419
1445 trace_xfs_link(tdp, target_name); 1420 trace_xfs_link(tdp, target_name);
1446 1421
1447 ASSERT(!S_ISDIR(sip->i_d.di_mode)); 1422 ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1448 1423
1449 if (XFS_FORCED_SHUTDOWN(mp)) 1424 if (XFS_FORCED_SHUTDOWN(mp))
1450 return -EIO; 1425 return -EIO;
@@ -1492,7 +1467,10 @@ xfs_link(
1492 1467
1493 xfs_bmap_init(&free_list, &first_block); 1468 xfs_bmap_init(&free_list, &first_block);
1494 1469
1495 if (sip->i_d.di_nlink == 0) { 1470 /*
1471 * Handle initial link state of O_TMPFILE inode
1472 */
1473 if (VFS_I(sip)->i_nlink == 0) {
1496 error = xfs_iunlink_remove(tp, sip); 1474 error = xfs_iunlink_remove(tp, sip);
1497 if (error) 1475 if (error)
1498 goto error_return; 1476 goto error_return;
@@ -1648,7 +1626,7 @@ xfs_release(
1648 xfs_mount_t *mp = ip->i_mount; 1626 xfs_mount_t *mp = ip->i_mount;
1649 int error; 1627 int error;
1650 1628
1651 if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) 1629 if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1652 return 0; 1630 return 0;
1653 1631
1654 /* If this is a read-only mount, don't do this (would generate I/O) */ 1632 /* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1679,7 +1657,7 @@ xfs_release(
1679 } 1657 }
1680 } 1658 }
1681 1659
1682 if (ip->i_d.di_nlink == 0) 1660 if (VFS_I(ip)->i_nlink == 0)
1683 return 0; 1661 return 0;
1684 1662
1685 if (xfs_can_free_eofblocks(ip, false)) { 1663 if (xfs_can_free_eofblocks(ip, false)) {
@@ -1883,7 +1861,7 @@ xfs_inactive(
1883 * If the inode is already free, then there can be nothing 1861 * If the inode is already free, then there can be nothing
1884 * to clean up here. 1862 * to clean up here.
1885 */ 1863 */
1886 if (ip->i_d.di_mode == 0) { 1864 if (VFS_I(ip)->i_mode == 0) {
1887 ASSERT(ip->i_df.if_real_bytes == 0); 1865 ASSERT(ip->i_df.if_real_bytes == 0);
1888 ASSERT(ip->i_df.if_broot_bytes == 0); 1866 ASSERT(ip->i_df.if_broot_bytes == 0);
1889 return; 1867 return;
@@ -1895,7 +1873,7 @@ xfs_inactive(
1895 if (mp->m_flags & XFS_MOUNT_RDONLY) 1873 if (mp->m_flags & XFS_MOUNT_RDONLY)
1896 return; 1874 return;
1897 1875
1898 if (ip->i_d.di_nlink != 0) { 1876 if (VFS_I(ip)->i_nlink != 0) {
1899 /* 1877 /*
1900 * force is true because we are evicting an inode from the 1878 * force is true because we are evicting an inode from the
1901 * cache. Post-eof blocks must be freed, lest we end up with 1879 * cache. Post-eof blocks must be freed, lest we end up with
@@ -1907,7 +1885,7 @@ xfs_inactive(
1907 return; 1885 return;
1908 } 1886 }
1909 1887
1910 if (S_ISREG(ip->i_d.di_mode) && 1888 if (S_ISREG(VFS_I(ip)->i_mode) &&
1911 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || 1889 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1912 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) 1890 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1913 truncate = 1; 1891 truncate = 1;
@@ -1916,7 +1894,7 @@ xfs_inactive(
1916 if (error) 1894 if (error)
1917 return; 1895 return;
1918 1896
1919 if (S_ISLNK(ip->i_d.di_mode)) 1897 if (S_ISLNK(VFS_I(ip)->i_mode))
1920 error = xfs_inactive_symlink(ip); 1898 error = xfs_inactive_symlink(ip);
1921 else if (truncate) 1899 else if (truncate)
1922 error = xfs_inactive_truncate(ip); 1900 error = xfs_inactive_truncate(ip);
@@ -1952,16 +1930,21 @@ xfs_inactive(
1952} 1930}
1953 1931
1954/* 1932/*
1955 * This is called when the inode's link count goes to 0. 1933 * This is called when the inode's link count goes to 0 or we are creating a
 1956 * We place the on-disk inode on a list in the AGI. It 1934 * tmpfile via O_TMPFILE. In the tmpfile case the link count is dropped to
 1957 * will be pulled from this list when the inode is freed. 1935 * zero by the VFS after we've created the file successfully, so we have to
 1936 * add the inode to the unlinked list while the link count is still
 1937 * non-zero.
1938 *
1939 * We place the on-disk inode on a list in the AGI. It will be pulled from this
1940 * list when the inode is freed.
1958 */ 1941 */
1959int 1942STATIC int
1960xfs_iunlink( 1943xfs_iunlink(
1961 xfs_trans_t *tp, 1944 struct xfs_trans *tp,
1962 xfs_inode_t *ip) 1945 struct xfs_inode *ip)
1963{ 1946{
1964 xfs_mount_t *mp; 1947 xfs_mount_t *mp = tp->t_mountp;
1965 xfs_agi_t *agi; 1948 xfs_agi_t *agi;
1966 xfs_dinode_t *dip; 1949 xfs_dinode_t *dip;
1967 xfs_buf_t *agibp; 1950 xfs_buf_t *agibp;
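
The updated comment describes the O_TMPFILE ordering problem: the VFS drops the link count to zero only after creation succeeds, so XFS parks the inode on the unlinked list up front. The userspace-visible side of that lifecycle:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	/* The inode exists with i_nlink == 0; unless linkat() gives it a
	 * name, it is freed when the last descriptor is closed. */
	(void)write(fd, "scratch", 7);
	close(fd);
	return 0;
}
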
@@ -1971,10 +1954,7 @@ xfs_iunlink(
1971 int offset; 1954 int offset;
1972 int error; 1955 int error;
1973 1956
1974 ASSERT(ip->i_d.di_nlink == 0); 1957 ASSERT(VFS_I(ip)->i_mode != 0);
1975 ASSERT(ip->i_d.di_mode != 0);
1976
1977 mp = tp->t_mountp;
1978 1958
1979 /* 1959 /*
1980 * Get the agi buffer first. It ensures lock ordering 1960 * Get the agi buffer first. It ensures lock ordering
@@ -2412,10 +2392,10 @@ xfs_ifree(
2412 struct xfs_icluster xic = { 0 }; 2392 struct xfs_icluster xic = { 0 };
2413 2393
2414 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2394 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2415 ASSERT(ip->i_d.di_nlink == 0); 2395 ASSERT(VFS_I(ip)->i_nlink == 0);
2416 ASSERT(ip->i_d.di_nextents == 0); 2396 ASSERT(ip->i_d.di_nextents == 0);
2417 ASSERT(ip->i_d.di_anextents == 0); 2397 ASSERT(ip->i_d.di_anextents == 0);
2418 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); 2398 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2419 ASSERT(ip->i_d.di_nblocks == 0); 2399 ASSERT(ip->i_d.di_nblocks == 0);
2420 2400
2421 /* 2401 /*
@@ -2429,7 +2409,7 @@ xfs_ifree(
2429 if (error) 2409 if (error)
2430 return error; 2410 return error;
2431 2411
2432 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2412 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
2433 ip->i_d.di_flags = 0; 2413 ip->i_d.di_flags = 0;
2434 ip->i_d.di_dmevmask = 0; 2414 ip->i_d.di_dmevmask = 0;
2435 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2415 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
@@ -2439,7 +2419,7 @@ xfs_ifree(
2439 * Bump the generation count so no one will be confused 2419 * Bump the generation count so no one will be confused
2440 * by reincarnations of this inode. 2420 * by reincarnations of this inode.
2441 */ 2421 */
2442 ip->i_d.di_gen++; 2422 VFS_I(ip)->i_generation++;
2443 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2423 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2444 2424
2445 if (xic.deleted) 2425 if (xic.deleted)
@@ -2526,7 +2506,7 @@ xfs_remove(
2526{ 2506{
2527 xfs_mount_t *mp = dp->i_mount; 2507 xfs_mount_t *mp = dp->i_mount;
2528 xfs_trans_t *tp = NULL; 2508 xfs_trans_t *tp = NULL;
2529 int is_dir = S_ISDIR(ip->i_d.di_mode); 2509 int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2530 int error = 0; 2510 int error = 0;
2531 xfs_bmap_free_t free_list; 2511 xfs_bmap_free_t free_list;
2532 xfs_fsblock_t first_block; 2512 xfs_fsblock_t first_block;
@@ -2580,8 +2560,8 @@ xfs_remove(
2580 * If we're removing a directory perform some additional validation. 2560 * If we're removing a directory perform some additional validation.
2581 */ 2561 */
2582 if (is_dir) { 2562 if (is_dir) {
2583 ASSERT(ip->i_d.di_nlink >= 2); 2563 ASSERT(VFS_I(ip)->i_nlink >= 2);
2584 if (ip->i_d.di_nlink != 2) { 2564 if (VFS_I(ip)->i_nlink != 2) {
2585 error = -ENOTEMPTY; 2565 error = -ENOTEMPTY;
2586 goto out_trans_cancel; 2566 goto out_trans_cancel;
2587 } 2567 }
@@ -2771,7 +2751,7 @@ xfs_cross_rename(
2771 if (dp1 != dp2) { 2751 if (dp1 != dp2) {
2772 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2752 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2773 2753
2774 if (S_ISDIR(ip2->i_d.di_mode)) { 2754 if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2775 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2755 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2776 dp1->i_ino, first_block, 2756 dp1->i_ino, first_block,
2777 free_list, spaceres); 2757 free_list, spaceres);
@@ -2779,7 +2759,7 @@ xfs_cross_rename(
2779 goto out_trans_abort; 2759 goto out_trans_abort;
2780 2760
2781 /* transfer ip2 ".." reference to dp1 */ 2761 /* transfer ip2 ".." reference to dp1 */
2782 if (!S_ISDIR(ip1->i_d.di_mode)) { 2762 if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2783 error = xfs_droplink(tp, dp2); 2763 error = xfs_droplink(tp, dp2);
2784 if (error) 2764 if (error)
2785 goto out_trans_abort; 2765 goto out_trans_abort;
@@ -2798,7 +2778,7 @@ xfs_cross_rename(
2798 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2778 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2799 } 2779 }
2800 2780
2801 if (S_ISDIR(ip1->i_d.di_mode)) { 2781 if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2802 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2782 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2803 dp2->i_ino, first_block, 2783 dp2->i_ino, first_block,
2804 free_list, spaceres); 2784 free_list, spaceres);
@@ -2806,7 +2786,7 @@ xfs_cross_rename(
2806 goto out_trans_abort; 2786 goto out_trans_abort;
2807 2787
2808 /* transfer ip1 ".." reference to dp2 */ 2788 /* transfer ip1 ".." reference to dp2 */
2809 if (!S_ISDIR(ip2->i_d.di_mode)) { 2789 if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2810 error = xfs_droplink(tp, dp1); 2790 error = xfs_droplink(tp, dp1);
2811 if (error) 2791 if (error)
2812 goto out_trans_abort; 2792 goto out_trans_abort;
@@ -2903,7 +2883,7 @@ xfs_rename(
2903 struct xfs_inode *inodes[__XFS_SORT_INODES]; 2883 struct xfs_inode *inodes[__XFS_SORT_INODES];
2904 int num_inodes = __XFS_SORT_INODES; 2884 int num_inodes = __XFS_SORT_INODES;
2905 bool new_parent = (src_dp != target_dp); 2885 bool new_parent = (src_dp != target_dp);
2906 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2886 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2907 int spaceres; 2887 int spaceres;
2908 int error; 2888 int error;
2909 2889
@@ -3032,12 +3012,12 @@ xfs_rename(
3032 * target and source are directories and that target can be 3012 * target and source are directories and that target can be
3033 * destroyed, or that neither is a directory. 3013 * destroyed, or that neither is a directory.
3034 */ 3014 */
3035 if (S_ISDIR(target_ip->i_d.di_mode)) { 3015 if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3036 /* 3016 /*
3037 * Make sure target dir is empty. 3017 * Make sure target dir is empty.
3038 */ 3018 */
3039 if (!(xfs_dir_isempty(target_ip)) || 3019 if (!(xfs_dir_isempty(target_ip)) ||
3040 (target_ip->i_d.di_nlink > 2)) { 3020 (VFS_I(target_ip)->i_nlink > 2)) {
3041 error = -EEXIST; 3021 error = -EEXIST;
3042 goto out_trans_cancel; 3022 goto out_trans_cancel;
3043 } 3023 }
@@ -3144,7 +3124,7 @@ xfs_rename(
3144 * intermediate state on disk. 3124 * intermediate state on disk.
3145 */ 3125 */
3146 if (wip) { 3126 if (wip) {
3147 ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); 3127 ASSERT(VFS_I(wip)->i_nlink == 0);
3148 error = xfs_bumplink(tp, wip); 3128 error = xfs_bumplink(tp, wip);
3149 if (error) 3129 if (error)
3150 goto out_bmap_cancel; 3130 goto out_bmap_cancel;
@@ -3313,7 +3293,7 @@ cluster_corrupt_out:
 	 * mark it as stale and brelse.
 	 */
 	if (bp->b_iodone) {
-		XFS_BUF_UNDONE(bp);
+		bp->b_flags &= ~XBF_DONE;
 		xfs_buf_stale(bp);
 		xfs_buf_ioerror(bp, -EIO);
 		xfs_buf_ioend(bp);
@@ -3462,14 +3442,7 @@ xfs_iflush_int(
 			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
 		goto corrupt_out;
 	}
-	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
-			   mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
-		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
-			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
-			__func__, ip->i_ino, ip, ip->i_d.di_magic);
-		goto corrupt_out;
-	}
-	if (S_ISREG(ip->i_d.di_mode)) {
+	if (S_ISREG(VFS_I(ip)->i_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
@@ -3479,7 +3452,7 @@ xfs_iflush_int(
 			__func__, ip->i_ino, ip);
 		goto corrupt_out;
 	}
-	} else if (S_ISDIR(ip->i_d.di_mode)) {
+	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
@@ -3523,12 +3496,11 @@ xfs_iflush_int(
 	ip->i_d.di_flushiter++;
 
 	/*
-	 * Copy the dirty parts of the inode into the on-disk
-	 * inode.  We always copy out the core of the inode,
-	 * because if the inode is dirty at all the core must
-	 * be.
+	 * Copy the dirty parts of the inode into the on-disk inode. We always
+	 * copy out the core of the inode, because if the inode is dirty at all
+	 * the core must be.
 	 */
-	xfs_dinode_to_disk(dip, &ip->i_d);
+	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
 
 	/* Wrap, we never let the log put out DI_MAX_FLUSH */
 	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3580,10 +3552,6 @@ xfs_iflush_int(
 	 */
 	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
 
-	/* update the lsn in the on disk inode if required */
-	if (ip->i_d.di_version == 3)
-		dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
-
 	/* generate the checksum. */
 	xfs_dinode_calc_crc(mp, dip);
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca9e11989cbd..43e1d51b15eb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -63,7 +63,7 @@ typedef struct xfs_inode {
 	unsigned long		i_flags;	/* see defined flags below */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
-	xfs_icdinode_t		i_d;		/* most of ondisk inode */
+	struct xfs_icdinode	i_d;		/* most of ondisk inode */
 
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
@@ -88,7 +88,7 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
  */
 static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
 {
-	if (S_ISREG(ip->i_d.di_mode))
+	if (S_ISREG(VFS_I(ip)->i_mode))
 		return i_size_read(VFS_I(ip));
 	return ip->i_d.di_size;
 }
@@ -369,7 +369,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  */
 #define XFS_INHERIT_GID(pip)	\
 	(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
-	 ((pip)->i_d.di_mode & S_ISGID))
+	 (VFS_I(pip)->i_mode & S_ISGID))
 
 int		xfs_release(struct xfs_inode *ip);
 void		xfs_inactive(struct xfs_inode *ip);
@@ -405,8 +405,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_bmap_free *);
 int		xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
 				      int, xfs_fsize_t);
-int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 
 void		xfs_iunpin_wait(xfs_inode_t *);
@@ -437,6 +435,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
 int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
 		     xfs_fsize_t isize, bool *did_zeroing);
 int	xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
+			     loff_t eof, int whence);
 
 
 /* from xfs_iops.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d14b12b8cfef..c48b5b18d771 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -135,7 +135,7 @@ xfs_inode_item_size(
 
 	*nvecs += 2;
 	*nbytes += sizeof(struct xfs_inode_log_format) +
-		   xfs_icdinode_size(ip->i_d.di_version);
+		   xfs_log_dinode_size(ip->i_d.di_version);
 
 	xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
 	if (XFS_IFORK_Q(ip))
@@ -322,6 +322,81 @@ xfs_inode_item_format_attr_fork(
 	}
 }
 
+static void
+xfs_inode_to_log_dinode(
+	struct xfs_inode	*ip,
+	struct xfs_log_dinode	*to,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_icdinode	*from = &ip->i_d;
+	struct inode		*inode = VFS_I(ip);
+
+	to->di_magic = XFS_DINODE_MAGIC;
+
+	to->di_version = from->di_version;
+	to->di_format = from->di_format;
+	to->di_uid = from->di_uid;
+	to->di_gid = from->di_gid;
+	to->di_projid_lo = from->di_projid_lo;
+	to->di_projid_hi = from->di_projid_hi;
+
+	memset(to->di_pad, 0, sizeof(to->di_pad));
+	memset(to->di_pad3, 0, sizeof(to->di_pad3));
+	to->di_atime.t_sec = inode->i_atime.tv_sec;
+	to->di_atime.t_nsec = inode->i_atime.tv_nsec;
+	to->di_mtime.t_sec = inode->i_mtime.tv_sec;
+	to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
+	to->di_ctime.t_sec = inode->i_ctime.tv_sec;
+	to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+	to->di_nlink = inode->i_nlink;
+	to->di_gen = inode->i_generation;
+	to->di_mode = inode->i_mode;
+
+	to->di_size = from->di_size;
+	to->di_nblocks = from->di_nblocks;
+	to->di_extsize = from->di_extsize;
+	to->di_nextents = from->di_nextents;
+	to->di_anextents = from->di_anextents;
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat = from->di_aformat;
+	to->di_dmevmask = from->di_dmevmask;
+	to->di_dmstate = from->di_dmstate;
+	to->di_flags = from->di_flags;
+
+	if (from->di_version == 3) {
+		to->di_changecount = inode->i_version;
+		to->di_crtime.t_sec = from->di_crtime.t_sec;
+		to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+		to->di_flags2 = from->di_flags2;
+
+		to->di_ino = ip->i_ino;
+		to->di_lsn = lsn;
+		memset(to->di_pad2, 0, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+		to->di_flushiter = 0;
+	} else {
+		to->di_flushiter = from->di_flushiter;
+	}
+}
+
+/*
+ * Format the inode core. Current timestamp data is only in the VFS inode
+ * fields, so we need to grab them from there. Hence rather than just copying
+ * the XFS inode core structure, format the fields directly into the iovec.
+ */
+static void
+xfs_inode_item_format_core(
+	struct xfs_inode	*ip,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp)
+{
+	struct xfs_log_dinode	*dic;
+
+	dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
+	xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
+	xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+}
+
 /*
  * This is called to fill in the vector of log iovecs for the given inode
  * log item. It fills the first item with an inode log format structure,
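The comment in the new xfs_inode_item_format_core() states the pattern: timestamps now live only in the VFS inode, so the core is formatted field by field into the log iovec instead of being copied wholesale from a single in-core structure. A minimal sketch of that prepare/fill/finish sequence, using the xlog_prepare_iovec()/xlog_finish_iovec() helpers visible above but with a hypothetical payload (example_log_format, EXAMPLE_MAGIC and example_item are illustrative, not part of this patch):

	/* Sketch only: serialise an in-core object into one log region. */
	static void
	example_item_format_core(
		struct example_item	*item,	/* hypothetical item type */
		struct xfs_log_vec	*lv,
		struct xfs_log_iovec	**vecp)
	{
		struct example_log_format *to;

		/* reserve a region of the log vector for this item type */
		to = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);

		/* format each field explicitly; nothing stale is copied */
		to->magic = EXAMPLE_MAGIC;	/* hypothetical constant */
		to->size = item->size;

		/* record how many bytes of the region were actually used */
		xlog_finish_iovec(lv, *vecp, sizeof(*to));
	}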
@@ -351,10 +426,7 @@ xfs_inode_item_format(
 	ilf->ilf_size = 2; /* format + core */
 	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
 
-	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
-			&ip->i_d,
-			xfs_icdinode_size(ip->i_d.di_version));
-
+	xfs_inode_item_format_core(ip, lv, &vecp);
 	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
 	if (XFS_IFORK_Q(ip)) {
 		xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 478d04e07f95..bcb6c19ce3ea 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -114,7 +114,7 @@ xfs_find_handle(
 		handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
 					sizeof(handle.ha_fid.fid_len);
 		handle.ha_fid.fid_pad = 0;
-		handle.ha_fid.fid_gen = ip->i_d.di_gen;
+		handle.ha_fid.fid_gen = inode->i_generation;
 		handle.ha_fid.fid_ino = ip->i_ino;
 
 		hsize = XFS_HSIZE(handle);
@@ -963,7 +963,7 @@ xfs_set_diflags(
 		di_flags |= XFS_DIFLAG_NODEFRAG;
 	if (xflags & FS_XFLAG_FILESTREAM)
 		di_flags |= XFS_DIFLAG_FILESTREAM;
-	if (S_ISDIR(ip->i_d.di_mode)) {
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
 		if (xflags & FS_XFLAG_RTINHERIT)
 			di_flags |= XFS_DIFLAG_RTINHERIT;
 		if (xflags & FS_XFLAG_NOSYMLINKS)
@@ -972,7 +972,7 @@ xfs_set_diflags(
 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 		if (xflags & FS_XFLAG_PROJINHERIT)
 			di_flags |= XFS_DIFLAG_PROJINHERIT;
-	} else if (S_ISREG(ip->i_d.di_mode)) {
+	} else if (S_ISREG(VFS_I(ip)->i_mode)) {
 		if (xflags & FS_XFLAG_REALTIME)
 			di_flags |= XFS_DIFLAG_REALTIME;
 		if (xflags & FS_XFLAG_EXTSIZE)
@@ -1060,23 +1060,86 @@ xfs_ioctl_setattr_xflags(
 }
 
 /*
+ * If we are changing DAX flags, we have to ensure the file is clean and any
+ * cached objects in the address space are invalidated and removed. This
+ * requires us to lock out other IO and page faults similar to a truncate
+ * operation. The locks need to be held until the transaction has been committed
+ * so that the cache invalidation is atomic with respect to the DAX flag
+ * manipulation.
+ */
+static int
+xfs_ioctl_setattr_dax_invalidate(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa,
+	int			*join_flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	int			error;
+
+	*join_flags = 0;
+
+	/*
+	 * It is only valid to set the DAX flag on regular files and
+	 * directories on filesystems where the block size is equal to the page
+	 * size. On directories it serves as an inherit hint.
+	 */
+	if (fa->fsx_xflags & FS_XFLAG_DAX) {
+		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+			return -EINVAL;
+		if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+			return -EINVAL;
+	}
+
+	/* If the DAX state is not changing, we have nothing to do here. */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
+		return 0;
+	if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
+		return 0;
+
+	/* lock, flush and invalidate mapping in preparation for flag change */
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+	error = filemap_write_and_wait(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+	error = invalidate_inode_pages2(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+
+	*join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+
+}
+
+/*
  * Set up the transaction structure for the setattr operation, checking that we
  * have permission to do so. On success, return a clean transaction and the
  * inode locked exclusively ready for further operation specific checks. On
  * failure, return an error without modifying or locking the inode.
+ *
+ * The inode might already be IO locked on call. If this is the case, it is
+ * indicated in @join_flags and we take full responsibility for ensuring they
+ * are unlocked from now on. Hence if we have an error here, we still have to
+ * unlock them. Otherwise, once they are joined to the transaction, they will
+ * be unlocked on commit/cancel.
  */
 static struct xfs_trans *
 xfs_ioctl_setattr_get_trans(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	int			join_flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
-	int			error;
+	int			error = -EROFS;
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return ERR_PTR(-EROFS);
+		goto out_unlock;
+	error = -EIO;
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return ERR_PTR(-EIO);
+		goto out_unlock;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
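Taken together, xfs_ioctl_setattr_dax_invalidate() and the reworked xfs_ioctl_setattr_get_trans() implement one idea: a flag change that alters page cache behaviour must be atomic with the cache invalidation, so the IO and mmap locks are taken before flushing and are only released by transaction commit or cancel. Reduced to its skeleton (a sketch assembled from the calls above, error handling trimmed):

	/* Sketch: quiesce the mapping, then hand lock ownership to the
	 * transaction so invalidation and flag change commit atomically. */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
	error = filemap_write_and_wait(inode->i_mapping);	/* flush dirty pages */
	if (!error)
		error = invalidate_inode_pages2(inode->i_mapping); /* drop cached pages */
	/* ... allocate transaction ... */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);	/* tp now owns the unlock */
	error = xfs_trans_commit(tp);				/* releases all joined locks */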
@@ -1084,7 +1147,8 @@ xfs_ioctl_setattr_get_trans(
 		goto out_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
+	join_flags = 0;
 
 	/*
 	 * CAP_FOWNER overrides the following restrictions:
@@ -1104,6 +1168,9 @@ xfs_ioctl_setattr_get_trans(
 
 out_cancel:
 	xfs_trans_cancel(tp);
+out_unlock:
+	if (join_flags)
+		xfs_iunlock(ip, join_flags);
 	return ERR_PTR(error);
 }
 
@@ -1128,14 +1195,14 @@ xfs_ioctl_setattr_check_extsize(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
 		return -EINVAL;
 
 	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
-	    !S_ISDIR(ip->i_d.di_mode))
+	    !S_ISDIR(VFS_I(ip)->i_mode))
 		return -EINVAL;
 
-	if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+	if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
 	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
 		return -EINVAL;
 
@@ -1202,6 +1269,7 @@ xfs_ioctl_setattr(
 	struct xfs_dquot	*pdqp = NULL;
 	struct xfs_dquot	*olddquot = NULL;
 	int			code;
+	int			join_flags = 0;
 
 	trace_xfs_ioctl_setattr(ip);
 
@@ -1225,7 +1293,18 @@ xfs_ioctl_setattr(
 		return code;
 	}
 
-	tp = xfs_ioctl_setattr_get_trans(ip);
+	/*
+	 * Changing DAX config may require inode locking for mapping
+	 * invalidation. These need to be held all the way to transaction commit
+	 * or cancel time, so need to be passed through to
+	 * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+	 * appropriately.
+	 */
+	code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
+	if (code)
+		goto error_free_dquots;
+
+	tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
 	if (IS_ERR(tp)) {
 		code = PTR_ERR(tp);
 		goto error_free_dquots;
@@ -1256,9 +1335,9 @@ xfs_ioctl_setattr(
 	 * successful return from chown()
 	 */
 
-	if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+	if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
 	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
-		ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+		VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
 
 	/* Change the ownerships and register project quota modifications */
 	if (xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1341,6 +1420,7 @@ xfs_ioc_setxflags(
 	struct xfs_trans	*tp;
 	struct fsxattr		fa;
 	unsigned int		flags;
+	int			join_flags = 0;
 	int			error;
 
 	if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1357,7 +1437,18 @@ xfs_ioc_setxflags(
 	if (error)
 		return error;
 
-	tp = xfs_ioctl_setattr_get_trans(ip);
+	/*
+	 * Changing DAX config may require inode locking for mapping
+	 * invalidation. These need to be held all the way to transaction commit
+	 * or cancel time, so need to be passed through to
+	 * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+	 * appropriately.
+	 */
+	error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
+	if (error)
+		goto out_drop_write;
+
+	tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
 	if (IS_ERR(tp)) {
 		error = PTR_ERR(tp);
 		goto out_drop_write;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 76b71a1c6c32..fb7dc61f4a29 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -459,8 +459,8 @@ xfs_vn_getattr(
 
 	stat->size = XFS_ISIZE(ip);
 	stat->dev = inode->i_sb->s_dev;
-	stat->mode = ip->i_d.di_mode;
-	stat->nlink = ip->i_d.di_nlink;
+	stat->mode = inode->i_mode;
+	stat->nlink = inode->i_nlink;
 	stat->uid = inode->i_uid;
 	stat->gid = inode->i_gid;
 	stat->ino = ip->i_ino;
@@ -506,9 +506,6 @@ xfs_setattr_mode(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	ip->i_d.di_mode &= S_IFMT;
-	ip->i_d.di_mode |= mode & ~S_IFMT;
-
 	inode->i_mode &= S_IFMT;
 	inode->i_mode |= mode & ~S_IFMT;
 }
@@ -522,21 +519,12 @@ xfs_setattr_time(
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-	if (iattr->ia_valid & ATTR_ATIME) {
+	if (iattr->ia_valid & ATTR_ATIME)
 		inode->i_atime = iattr->ia_atime;
-		ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
-		ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
-	}
-	if (iattr->ia_valid & ATTR_CTIME) {
+	if (iattr->ia_valid & ATTR_CTIME)
 		inode->i_ctime = iattr->ia_ctime;
-		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
-		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
-	}
-	if (iattr->ia_valid & ATTR_MTIME) {
+	if (iattr->ia_valid & ATTR_MTIME)
 		inode->i_mtime = iattr->ia_mtime;
-		ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
-		ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-	}
 }
 
 int
@@ -661,9 +649,9 @@ xfs_setattr_nonsize(
 		 * The set-user-ID and set-group-ID bits of a file will be
 		 * cleared upon successful return from chown()
 		 */
-		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+		if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
 		    !capable(CAP_FSETID))
-			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+			inode->i_mode &= ~(S_ISUID|S_ISGID);
 
 		/*
 		 * Change the ownerships and register quota modifications
@@ -773,7 +761,7 @@ xfs_setattr_size(
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
-	ASSERT(S_ISREG(ip->i_d.di_mode));
+	ASSERT(S_ISREG(inode->i_mode));
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
 
@@ -991,21 +979,13 @@ xfs_vn_update_time(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (flags & S_CTIME) {
+	if (flags & S_CTIME)
 		inode->i_ctime = *now;
-		ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec;
-		ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec;
-	}
-	if (flags & S_MTIME) {
+	if (flags & S_MTIME)
 		inode->i_mtime = *now;
-		ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec;
-		ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec;
-	}
-	if (flags & S_ATIME) {
+	if (flags & S_ATIME)
 		inode->i_atime = *now;
-		ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec;
-	}
+
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
 	return xfs_trans_commit(tp);
@@ -1205,8 +1185,10 @@ xfs_diflags_to_iflags(
 		inode->i_flags |= S_SYNC;
 	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	if (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
-	    ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+	if (S_ISREG(inode->i_mode) &&
+	    ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+	    (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+	     ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
 		inode->i_flags |= S_DAX;
 }
 
@@ -1232,8 +1214,6 @@ xfs_setup_inode(
 	/* make the inode look hashed for the writeback code */
 	hlist_add_fake(&inode->i_hash);
 
-	inode->i_mode	= ip->i_d.di_mode;
-	set_nlink(inode, ip->i_d.di_nlink);
 	inode->i_uid	= xfs_uid_to_kuid(ip->i_d.di_uid);
 	inode->i_gid	= xfs_gid_to_kgid(ip->i_d.di_gid);
 
@@ -1249,14 +1229,7 @@ xfs_setup_inode(
 		break;
 	}
 
-	inode->i_generation = ip->i_d.di_gen;
 	i_size_write(inode, ip->i_d.di_size);
-	inode->i_atime.tv_sec	= ip->i_d.di_atime.t_sec;
-	inode->i_atime.tv_nsec	= ip->i_d.di_atime.t_nsec;
-	inode->i_mtime.tv_sec	= ip->i_d.di_mtime.t_sec;
-	inode->i_mtime.tv_nsec	= ip->i_d.di_mtime.t_nsec;
-	inode->i_ctime.tv_sec	= ip->i_d.di_ctime.t_sec;
-	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
 	xfs_diflags_to_iflags(inode, ip);
 
 	ip->d_ops = ip->i_mount->m_nondir_inode_ops;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 930ebd86beba..ce73eb34620d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -57,6 +57,7 @@ xfs_bulkstat_one_int(
 {
 	struct xfs_icdinode	*dic;		/* dinode core info pointer */
 	struct xfs_inode	*ip;		/* incore inode pointer */
+	struct inode		*inode;
 	struct xfs_bstat	*buf;		/* return buffer */
 	int			error = 0;	/* error value */
 
@@ -77,30 +78,33 @@ xfs_bulkstat_one_int(
 
 	ASSERT(ip != NULL);
 	ASSERT(ip->i_imap.im_blkno != 0);
+	inode = VFS_I(ip);
 
 	dic = &ip->i_d;
 
 	/* xfs_iget returns the following without needing
 	 * further change.
 	 */
-	buf->bs_nlink = dic->di_nlink;
 	buf->bs_projid_lo = dic->di_projid_lo;
 	buf->bs_projid_hi = dic->di_projid_hi;
 	buf->bs_ino = ino;
-	buf->bs_mode = dic->di_mode;
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
-	buf->bs_atime.tv_sec = dic->di_atime.t_sec;
-	buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
-	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
-	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
-	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
-	buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+
+	buf->bs_nlink = inode->i_nlink;
+	buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+	buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+	buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+	buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+	buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+	buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+	buf->bs_gen = inode->i_generation;
+	buf->bs_mode = inode->i_mode;
+
 	buf->bs_xflags = xfs_ip2xflags(ip);
 	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
 	buf->bs_extents = dic->di_nextents;
-	buf->bs_gen = dic->di_gen;
 	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
 	buf->bs_dmevmask = dic->di_dmevmask;
 	buf->bs_dmstate = dic->di_dmstate;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9c9a1c9bcc7f..b49ccf5c1d75 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1212,7 +1212,7 @@ xlog_iodone(xfs_buf_t *bp)
 	}
 
 	/* log I/O is always issued ASYNC */
-	ASSERT(XFS_BUF_ISASYNC(bp));
+	ASSERT(bp->b_flags & XBF_ASYNC);
 	xlog_state_done_syncing(iclog, aborted);
 
 	/*
@@ -1864,9 +1864,8 @@ xlog_sync(
 
 	bp->b_io_length = BTOBB(count);
 	bp->b_fspriv = iclog;
-	XFS_BUF_ZEROFLAGS(bp);
-	XFS_BUF_ASYNC(bp);
-	bp->b_flags |= XBF_SYNCIO;
+	bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+	bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
 
 	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
 		bp->b_flags |= XBF_FUA;
@@ -1893,12 +1892,11 @@ xlog_sync(
 
 	/* account for log which doesn't start at block #0 */
 	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+
 	/*
 	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
 	 * is shutting down.
 	 */
-	XFS_BUF_WRITE(bp);
-
 	error = xlog_bdstrat(bp);
 	if (error) {
 		xfs_buf_ioerror_alert(bp, "xlog_sync");
@@ -1910,9 +1908,8 @@ xlog_sync(
 		xfs_buf_associate_memory(bp,
 				(char *)&iclog->ic_header + count, split);
 		bp->b_fspriv = iclog;
-		XFS_BUF_ZEROFLAGS(bp);
-		XFS_BUF_ASYNC(bp);
-		bp->b_flags |= XBF_SYNCIO;
+		bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+		bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
 		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 			bp->b_flags |= XBF_FUA;
 
@@ -1921,7 +1918,6 @@ xlog_sync(
 
 		/* account for internal log which doesn't start at block #0 */
 		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-		XFS_BUF_WRITE(bp);
 		error = xlog_bdstrat(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
@@ -2012,77 +2008,81 @@ xlog_print_tic_res(
 	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
 
 	/* match with XLOG_REG_TYPE_* in xfs_log.h */
-	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
-	    "bformat",
-	    "bchunk",
-	    "efi_format",
-	    "efd_format",
-	    "iformat",
-	    "icore",
-	    "iext",
-	    "ibroot",
-	    "ilocal",
-	    "iattr_ext",
-	    "iattr_broot",
-	    "iattr_local",
-	    "qformat",
-	    "dquot",
-	    "quotaoff",
-	    "LR header",
-	    "unmount",
-	    "commit",
-	    "trans header"
+#define REG_TYPE_STR(type, str)	[XLOG_REG_TYPE_##type] = str
+	static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+	    REG_TYPE_STR(BFORMAT, "bformat"),
+	    REG_TYPE_STR(BCHUNK, "bchunk"),
+	    REG_TYPE_STR(EFI_FORMAT, "efi_format"),
+	    REG_TYPE_STR(EFD_FORMAT, "efd_format"),
+	    REG_TYPE_STR(IFORMAT, "iformat"),
+	    REG_TYPE_STR(ICORE, "icore"),
+	    REG_TYPE_STR(IEXT, "iext"),
+	    REG_TYPE_STR(IBROOT, "ibroot"),
+	    REG_TYPE_STR(ILOCAL, "ilocal"),
+	    REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
+	    REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
+	    REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
+	    REG_TYPE_STR(QFORMAT, "qformat"),
+	    REG_TYPE_STR(DQUOT, "dquot"),
+	    REG_TYPE_STR(QUOTAOFF, "quotaoff"),
+	    REG_TYPE_STR(LRHEADER, "LR header"),
+	    REG_TYPE_STR(UNMOUNT, "unmount"),
+	    REG_TYPE_STR(COMMIT, "commit"),
+	    REG_TYPE_STR(TRANSHDR, "trans header"),
+	    REG_TYPE_STR(ICREATE, "inode create")
 	};
+#undef REG_TYPE_STR
+#define TRANS_TYPE_STR(type)	[XFS_TRANS_##type] = #type
 	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
-	    "SETATTR_NOT_SIZE",
-	    "SETATTR_SIZE",
-	    "INACTIVE",
-	    "CREATE",
-	    "CREATE_TRUNC",
-	    "TRUNCATE_FILE",
-	    "REMOVE",
-	    "LINK",
-	    "RENAME",
-	    "MKDIR",
-	    "RMDIR",
-	    "SYMLINK",
-	    "SET_DMATTRS",
-	    "GROWFS",
-	    "STRAT_WRITE",
-	    "DIOSTRAT",
-	    "WRITE_SYNC",
-	    "WRITEID",
-	    "ADDAFORK",
-	    "ATTRINVAL",
-	    "ATRUNCATE",
-	    "ATTR_SET",
-	    "ATTR_RM",
-	    "ATTR_FLAG",
-	    "CLEAR_AGI_BUCKET",
-	    "QM_SBCHANGE",
-	    "DUMMY1",
-	    "DUMMY2",
-	    "QM_QUOTAOFF",
-	    "QM_DQALLOC",
-	    "QM_SETQLIM",
-	    "QM_DQCLUSTER",
-	    "QM_QINOCREATE",
-	    "QM_QUOTAOFF_END",
-	    "FSYNC_TS",
-	    "GROWFSRT_ALLOC",
-	    "GROWFSRT_ZERO",
-	    "GROWFSRT_FREE",
-	    "SWAPEXT",
-	    "CHECKPOINT",
-	    "ICREATE",
-	    "CREATE_TMPFILE"
+	    TRANS_TYPE_STR(SETATTR_NOT_SIZE),
+	    TRANS_TYPE_STR(SETATTR_SIZE),
+	    TRANS_TYPE_STR(INACTIVE),
+	    TRANS_TYPE_STR(CREATE),
+	    TRANS_TYPE_STR(CREATE_TRUNC),
+	    TRANS_TYPE_STR(TRUNCATE_FILE),
+	    TRANS_TYPE_STR(REMOVE),
+	    TRANS_TYPE_STR(LINK),
+	    TRANS_TYPE_STR(RENAME),
+	    TRANS_TYPE_STR(MKDIR),
+	    TRANS_TYPE_STR(RMDIR),
+	    TRANS_TYPE_STR(SYMLINK),
+	    TRANS_TYPE_STR(SET_DMATTRS),
+	    TRANS_TYPE_STR(GROWFS),
+	    TRANS_TYPE_STR(STRAT_WRITE),
+	    TRANS_TYPE_STR(DIOSTRAT),
+	    TRANS_TYPE_STR(WRITEID),
+	    TRANS_TYPE_STR(ADDAFORK),
+	    TRANS_TYPE_STR(ATTRINVAL),
+	    TRANS_TYPE_STR(ATRUNCATE),
+	    TRANS_TYPE_STR(ATTR_SET),
+	    TRANS_TYPE_STR(ATTR_RM),
+	    TRANS_TYPE_STR(ATTR_FLAG),
+	    TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
+	    TRANS_TYPE_STR(SB_CHANGE),
+	    TRANS_TYPE_STR(DUMMY1),
+	    TRANS_TYPE_STR(DUMMY2),
+	    TRANS_TYPE_STR(QM_QUOTAOFF),
+	    TRANS_TYPE_STR(QM_DQALLOC),
+	    TRANS_TYPE_STR(QM_SETQLIM),
+	    TRANS_TYPE_STR(QM_DQCLUSTER),
+	    TRANS_TYPE_STR(QM_QINOCREATE),
+	    TRANS_TYPE_STR(QM_QUOTAOFF_END),
+	    TRANS_TYPE_STR(FSYNC_TS),
+	    TRANS_TYPE_STR(GROWFSRT_ALLOC),
+	    TRANS_TYPE_STR(GROWFSRT_ZERO),
+	    TRANS_TYPE_STR(GROWFSRT_FREE),
+	    TRANS_TYPE_STR(SWAPEXT),
+	    TRANS_TYPE_STR(CHECKPOINT),
+	    TRANS_TYPE_STR(ICREATE),
+	    TRANS_TYPE_STR(CREATE_TMPFILE)
 	};
+#undef TRANS_TYPE_STR
 
 	xfs_warn(mp, "xlog_write: reservation summary:");
 	xfs_warn(mp, "  trans type  = %s (%u)",
 		 ((ticket->t_trans_type <= 0 ||
 		   ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-		  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+		  "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
 		 ticket->t_trans_type);
 	xfs_warn(mp, "  unit res    = %d bytes",
 		 ticket->t_unit_res);
@@ -2101,7 +2101,7 @@ xlog_print_tic_res(
 		uint r_type = ticket->t_res_arr[i].r_type;
 		xfs_warn(mp, "region[%u]: %s - %u bytes", i,
 			 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
-			  "bad-rtype" : res_type_str[r_type-1]),
+			  "bad-rtype" : res_type_str[r_type]),
 			 ticket->t_res_arr[i].r_len);
 	}
 
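The table rework above is the C99 designated-initializer idiom: each slot is keyed by its symbolic constant, so entries can no longer drift out of step with the enum values (the old positional list appears already to have drifted, still carrying a WRITE_SYNC entry that the keyed list drops, which is also why the lookups lose their `-1`). The idiom in isolation, as a self-contained sketch with hypothetical names:

	#include <stdio.h>

	enum reg_type { REG_BFORMAT = 1, REG_BCHUNK = 2, REG_TYPE_MAX = 2 };

	/* index by enum value; unnamed slots stay NULL, so gaps are detectable */
	#define REG_TYPE_STR(type, str)	[REG_##type] = str
	static const char *reg_type_str[REG_TYPE_MAX + 1] = {
		REG_TYPE_STR(BFORMAT, "bformat"),
		REG_TYPE_STR(BCHUNK, "bchunk"),
	};
	#undef REG_TYPE_STR

	int main(void)
	{
		enum reg_type t = REG_BCHUNK;
		printf("%s\n", reg_type_str[t] ? reg_type_str[t] : "bad-rtype");
		return 0;
	}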
@@ -3979,7 +3979,7 @@ xfs_log_force_umount(
 	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
 		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
 		if (mp->m_sb_bp)
-			XFS_BUF_DONE(mp->m_sb_bp);
+			mp->m_sb_bp->b_flags |= XBF_DONE;
 		return 0;
 	}
 
@@ -4009,7 +4009,7 @@ xfs_log_force_umount(
 	spin_lock(&log->l_icloglock);
 	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
 	if (mp->m_sb_bp)
-		XFS_BUF_DONE(mp->m_sb_bp);
+		mp->m_sb_bp->b_flags |= XBF_DONE;
 
 	/*
 	 * Mark the log and the iclogs with IO error flags to prevent any
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index be5568839442..396565f43247 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -190,7 +190,7 @@ xlog_bread_noalign(
 	ASSERT(nbblks <= bp->b_length);
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
-	XFS_BUF_READ(bp);
+	bp->b_flags |= XBF_READ;
 	bp->b_io_length = nbblks;
 	bp->b_error = 0;
 
@@ -275,7 +275,6 @@ xlog_bwrite(
 	ASSERT(nbblks <= bp->b_length);
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
-	XFS_BUF_ZEROFLAGS(bp);
 	xfs_buf_hold(bp);
 	xfs_buf_lock(bp);
 	bp->b_io_length = nbblks;
@@ -2538,6 +2537,13 @@ xlog_recover_validate_buf_type(
 		}
 		bp->b_ops = &xfs_sb_buf_ops;
 		break;
+#ifdef CONFIG_XFS_RT
+	case XFS_BLFT_RTBITMAP_BUF:
+	case XFS_BLFT_RTSUMMARY_BUF:
+		/* no magic numbers for verification of RT buffers */
+		bp->b_ops = &xfs_rtbuf_ops;
+		break;
+#endif /* CONFIG_XFS_RT */
 	default:
 		xfs_warn(mp, "Unknown buffer type %d!",
 			 xfs_blft_from_flags(buf_f));
@@ -2858,7 +2864,7 @@ xfs_recover_inode_owner_change(
 		return -ENOMEM;
 
 	/* instantiate the inode */
-	xfs_dinode_from_disk(&ip->i_d, dip);
+	xfs_inode_from_disk(ip, dip);
 	ASSERT(ip->i_d.di_version >= 3);
 
 	error = xfs_iformat_fork(ip, dip);
@@ -2904,7 +2910,7 @@ xlog_recover_inode_pass2(
 	int			error;
 	int			attr_index;
 	uint			fields;
-	xfs_icdinode_t		*dicp;
+	struct xfs_log_dinode	*ldip;
 	uint			isize;
 	int			need_free = 0;
 
@@ -2957,8 +2963,8 @@ xlog_recover_inode_pass2(
 		error = -EFSCORRUPTED;
 		goto out_release;
 	}
-	dicp = item->ri_buf[1].i_addr;
-	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
+	ldip = item->ri_buf[1].i_addr;
+	if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
 		xfs_alert(mp,
 			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
 			__func__, item, in_f->ilf_ino);
@@ -2994,13 +3000,13 @@ xlog_recover_inode_pass2(
 	 * to skip replay when the on disk inode is newer than the log one
 	 */
 	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
-	    dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+	    ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
 		/*
 		 * Deal with the wrap case, DI_MAX_FLUSH is less
 		 * than smaller numbers
 		 */
 		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
-		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+		    ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
 			/* do nothing */
 		} else {
 			trace_xfs_log_recover_inode_skip(log, in_f);
@@ -3010,13 +3016,13 @@ xlog_recover_inode_pass2(
 	}
 
 	/* Take the opportunity to reset the flush iteration count */
-	dicp->di_flushiter = 0;
+	ldip->di_flushiter = 0;
 
-	if (unlikely(S_ISREG(dicp->di_mode))) {
-		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
-		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+	if (unlikely(S_ISREG(ldip->di_mode))) {
+		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
-					     XFS_ERRLEVEL_LOW, mp, dicp);
+					     XFS_ERRLEVEL_LOW, mp, ldip);
 			xfs_alert(mp,
 				"%s: Bad regular inode log record, rec ptr 0x%p, "
 				"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -3024,12 +3030,12 @@ xlog_recover_inode_pass2(
 			error = -EFSCORRUPTED;
 			goto out_release;
 		}
-	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
-		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
-		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
-		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+	} else if (unlikely(S_ISDIR(ldip->di_mode))) {
+		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+		    (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
-					     XFS_ERRLEVEL_LOW, mp, dicp);
+					     XFS_ERRLEVEL_LOW, mp, ldip);
 			xfs_alert(mp,
 				"%s: Bad dir inode log record, rec ptr 0x%p, "
 				"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -3038,32 +3044,32 @@ xlog_recover_inode_pass2(
 			goto out_release;
 		}
 	}
-	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
+	if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
-				     XFS_ERRLEVEL_LOW, mp, dicp);
+				     XFS_ERRLEVEL_LOW, mp, ldip);
 		xfs_alert(mp,
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
 			__func__, item, dip, bp, in_f->ilf_ino,
-			dicp->di_nextents + dicp->di_anextents,
-			dicp->di_nblocks);
+			ldip->di_nextents + ldip->di_anextents,
+			ldip->di_nblocks);
 		error = -EFSCORRUPTED;
 		goto out_release;
 	}
-	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
+	if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
-				     XFS_ERRLEVEL_LOW, mp, dicp);
+				     XFS_ERRLEVEL_LOW, mp, ldip);
 		xfs_alert(mp,
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
-			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
+			item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
 		error = -EFSCORRUPTED;
 		goto out_release;
 	}
-	isize = xfs_icdinode_size(dicp->di_version);
+	isize = xfs_log_dinode_size(ldip->di_version);
 	if (unlikely(item->ri_buf[1].i_len > isize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
-				     XFS_ERRLEVEL_LOW, mp, dicp);
+				     XFS_ERRLEVEL_LOW, mp, ldip);
 		xfs_alert(mp,
 			"%s: Bad inode log record length %d, rec ptr 0x%p",
 			__func__, item->ri_buf[1].i_len, item);
@@ -3071,8 +3077,8 @@ xlog_recover_inode_pass2(
 		goto out_release;
 	}
 
-	/* The core is in in-core format */
-	xfs_dinode_to_disk(dip, dicp);
+	/* recover the log dinode inode into the on disk inode */
+	xfs_log_dinode_to_disk(ldip, dip);
 
 	/* the rest is in on-disk format */
 	if (item->ri_buf[1].i_len > isize) {
@@ -4402,8 +4408,8 @@ xlog_recover_process_one_iunlink(
 	if (error)
 		goto fail_iput;
 
-	ASSERT(ip->i_d.di_nlink == 0);
-	ASSERT(ip->i_d.di_mode != 0);
+	ASSERT(VFS_I(ip)->i_nlink == 0);
+	ASSERT(VFS_I(ip)->i_mode != 0);
 
 	/* setup for the next pass */
 	agino = be32_to_cpu(dip->di_next_unlinked);
@@ -4957,6 +4963,7 @@ xlog_do_recover(
 	xfs_daddr_t	head_blk,
 	xfs_daddr_t	tail_blk)
 {
+	struct xfs_mount *mp = log->l_mp;
 	int		error;
 	xfs_buf_t	*bp;
 	xfs_sb_t	*sbp;
@@ -4971,7 +4978,7 @@ xlog_do_recover(
 	/*
 	 * If IO errors happened during recovery, bail out.
 	 */
-	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+	if (XFS_FORCED_SHUTDOWN(mp)) {
 		return -EIO;
 	}
 
@@ -4984,22 +4991,21 @@ xlog_do_recover(
 	 * or iunlinks they will have some entries in the AIL; so we look at
 	 * the AIL to determine how to set the tail_lsn.
 	 */
-	xlog_assign_tail_lsn(log->l_mp);
+	xlog_assign_tail_lsn(mp);
 
 	/*
 	 * Now that we've finished replaying all buffer and inode
 	 * updates, re-read in the superblock and reverify it.
 	 */
-	bp = xfs_getsb(log->l_mp, 0);
-	XFS_BUF_UNDONE(bp);
-	ASSERT(!(XFS_BUF_ISWRITE(bp)));
-	XFS_BUF_READ(bp);
-	XFS_BUF_UNASYNC(bp);
+	bp = xfs_getsb(mp, 0);
+	bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
+	ASSERT(!(bp->b_flags & XBF_WRITE));
+	bp->b_flags |= XBF_READ;
 	bp->b_ops = &xfs_sb_buf_ops;
 
 	error = xfs_buf_submit_wait(bp);
 	if (error) {
-		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		if (!XFS_FORCED_SHUTDOWN(mp)) {
 			xfs_buf_ioerror_alert(bp, __func__);
 			ASSERT(0);
 		}
@@ -5008,14 +5014,17 @@ xlog_do_recover(
 	}
 
 	/* Convert superblock from on-disk format */
-	sbp = &log->l_mp->m_sb;
+	sbp = &mp->m_sb;
 	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
-	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
-	ASSERT(xfs_sb_good_version(sbp));
-	xfs_reinit_percpu_counters(log->l_mp);
-
 	xfs_buf_relse(bp);
 
+	/* re-initialise in-core superblock and geometry structures */
+	xfs_reinit_percpu_counters(mp);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
+		return error;
+	}
 
 	xlog_recover_check_summary(log);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb753b359bee..536a0ee9cd5a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -185,9 +185,6 @@ xfs_initialize_perag(
 	xfs_agnumber_t	index;
 	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
-	xfs_ino_t	ino;
-	xfs_sb_t	*sbp = &mp->m_sb;
 	int		error = -ENOMEM;
 
 	/*
@@ -230,22 +227,7 @@ xfs_initialize_perag(
 		radix_tree_preload_end();
 	}
 
-	/*
-	 * If we mount with the inode64 option, or no inode overflows
-	 * the legacy 32-bit address space clear the inode32 option.
-	 */
-	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
-	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
-	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
-		mp->m_flags |= XFS_MOUNT_32BITINODES;
-	else
-		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-
-	if (mp->m_flags & XFS_MOUNT_32BITINODES)
-		index = xfs_set_inode32(mp, agcount);
-	else
-		index = xfs_set_inode64(mp, agcount);
+	index = xfs_set_inode_alloc(mp, agcount);
 
 	if (maxagi)
 		*maxagi = index;
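The deleted block records the policy that xfs_set_inode_alloc() is now presumed to centralise: compute the largest inode number the current geometry can produce, and turn the inode32 allocator on only when the user asked for small inode numbers and that maximum would overflow 32 bits. In outline (a sketch of the removed logic, not a copy of the new helper):

	/* Sketch: decide whether the inode32 allocator is needed. */
	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);	/* highest possible inode */

	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
		mp->m_flags |= XFS_MOUNT_32BITINODES;	/* confine new inodes to 32 bits */
	else
		mp->m_flags &= ~XFS_MOUNT_32BITINODES;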
@@ -865,7 +847,7 @@ xfs_mountfs(
 
 	ASSERT(rip != NULL);
 
-	if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
+	if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
 		xfs_warn(mp, "corrupted root inode %llu: not a directory",
 			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1284,7 +1266,7 @@ xfs_getsb(
 	}
 
 	xfs_buf_hold(bp);
-	ASSERT(XFS_BUF_ISDONE(bp));
+	ASSERT(bp->b_flags & XBF_DONE);
 	return bp;
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b57098481c10..bac6b3435591 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -147,6 +147,17 @@ typedef struct xfs_mount {
 	 * to various other kinds of pain inflicted on the pNFS server.
 	 */
 	__uint32_t		m_generation;
+
+#ifdef DEBUG
+	/*
+	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
+	 * block killing in the event of failed writes. When enabled, all
+	 * buffered writes are forced to fail. All delalloc blocks in the range
+	 * of the write (including pre-existing delalloc blocks!) are tossed as
+	 * part of the write failure error handling sequence.
+	 */
+	bool			m_fail_writes;
+#endif
 } xfs_mount_t;
 
 /*
@@ -166,9 +177,8 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
 #define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
 #define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)	/* set default i/o size */
-#define XFS_MOUNT_32BITINODES	(1ULL << 14)	/* do not create inodes above
-						 * 32 bits in size */
-#define XFS_MOUNT_SMALL_INUMS	(1ULL << 15)	/* users wants 32bit inodes */
+#define XFS_MOUNT_SMALL_INUMS	(1ULL << 14)	/* user wants 32bit inodes */
+#define XFS_MOUNT_32BITINODES	(1ULL << 15)	/* inode32 allocator active */
 #define XFS_MOUNT_NOUUID	(1ULL << 16)	/* ignore uuid during mount */
 #define XFS_MOUNT_BARRIER	(1ULL << 17)
 #define XFS_MOUNT_IKEEP		(1ULL << 18)	/* keep empty inode clusters*/
@@ -264,6 +274,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
 }
 
+#ifdef DEBUG
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+	return mp->m_fail_writes;
+}
+#else
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+	return 0;
+}
+#endif
+
 /*
  * Per-ag incore structure, copies of information in agf and agi, to improve the
  * performance of allocation group selection.
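m_fail_writes and xfs_mp_fail_writes() follow the usual kernel idiom for DEBUG-only fault injection: the state exists only under #ifdef DEBUG, and the release-build stub returns a constant so the compiler removes the test at every call site. A standalone rendering of the idiom (hypothetical names; compile with -DDEBUG to enable):

	#include <stdbool.h>

	struct mount_ctx {
	#ifdef DEBUG
		bool fail_writes;	/* force buffered writes to fail */
	#endif
	};

	#ifdef DEBUG
	static inline bool mount_fail_writes(struct mount_ctx *m)
	{
		return m->fail_writes;
	}
	#else
	static inline bool mount_fail_writes(struct mount_ctx *m)
	{
		return false;	/* constant-folds away in release builds */
	}
	#endif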
@@ -327,7 +351,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
 			 bool reserved);
 extern int	xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
 
-extern int	xfs_mount_log_sb(xfs_mount_t *);
 extern struct xfs_buf	*xfs_getsb(xfs_mount_t *, int);
 extern int	xfs_readsb(xfs_mount_t *, int);
 extern void	xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
new file mode 100644
index 000000000000..184c44effdd5
--- /dev/null
+++ b/fs/xfs/xfs_ondisk.h
@@ -0,0 +1,117 @@
1/*
2 * Copyright (c) 2016 Oracle.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_ONDISK_H
19#define __XFS_ONDISK_H
20
21#define XFS_CHECK_STRUCT_SIZE(structname, size) \
22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
23 #structname ") is wrong, expected " #size)
24
25static inline void __init
26xfs_check_ondisk_structs(void)
27{
28 /* ag/file structures */
29 XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
30 XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
31 XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
32 XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
33 XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
34 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
35 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
36 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
37 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
38 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
39 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
40 XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
41 XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
42 XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
43 XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
44 XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
45 XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
46 XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
47 XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
48 XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
49 XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
50
51 /* dir/attr trees */
52 XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
53 XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
54 XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
55 XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
56 XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
57 XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64);
58 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48);
59 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64);
60 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64);
61 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
62 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
63 XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
64 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
65 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
66 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
67 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
68
69 /*
70 * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
71 * 4 bytes anyway so it's not obviously a problem. Hence for the moment
 72 * we don't check this structure. This can be reinstated when the attr
 73 * definitions are updated to use C99 VLA definitions.
74 *
75 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
76 */
77
78 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
79 XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8);
80 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
81 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
82 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
83 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
84 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
85 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
86 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6);
87 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
88 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
89 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
90 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
91 XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
92 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
93 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
94 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
95 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
96 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
97 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
98 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
99
100 /* log structures */
101 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
102 XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
103 XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
104 XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
105 XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
106 XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
107 XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
108 XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
109 XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
110 XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
111 XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
112 XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
113 XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
114 XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
115}
116
117#endif /* __XFS_ONDISK_H */
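xfs_ondisk.h exists purely to pin on-disk structure sizes at compile time: if a compiler or architecture quietly changes a layout, the build breaks instead of the disk format. BUILD_BUG_ON_MSG is kernel-only; the same check can be sketched in userspace with C11 _Static_assert (the struct below is a made-up stand-in, not a real XFS on-disk structure):

/* Compile-time size check analogous to XFS_CHECK_STRUCT_SIZE. */
#include <stdint.h>

struct demo_ondisk {
	uint32_t magic;		/* 4 bytes */
	uint64_t blockno;	/* 8 bytes */
} __attribute__((packed));

#define CHECK_STRUCT_SIZE(structname, size) \
	_Static_assert(sizeof(structname) == (size), \
		       "sizeof(" #structname ") is wrong, expected " #size)

CHECK_STRUCT_SIZE(struct demo_ondisk, 12);

int main(void) { return 0; }

Changing the expected size to anything but 12 makes the translation unit fail to compile, which is exactly the behavior xfs_check_ondisk_structs() gets from BUILD_BUG_ON_MSG.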
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 8147ac108820..93f74853961b 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
1#ifndef _XFS_PNFS_H 1#ifndef _XFS_PNFS_H
2#define _XFS_PNFS_H 1 2#define _XFS_PNFS_H 1
3 3
4#ifdef CONFIG_NFSD_PNFS 4#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
5int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); 5int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
6int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, 6int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
7 struct iomap *iomap, bool write, u32 *device_generation); 7 struct iomap *iomap, bool write, u32 *device_generation);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 532ab79d38fe..be125e1758c1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -560,6 +560,37 @@ xfs_qm_shrink_count(
560 return list_lru_shrink_count(&qi->qi_lru, sc); 560 return list_lru_shrink_count(&qi->qi_lru, sc);
561} 561}
562 562
563STATIC void
564xfs_qm_set_defquota(
565 xfs_mount_t *mp,
566 uint type,
567 xfs_quotainfo_t *qinf)
568{
569 xfs_dquot_t *dqp;
570 struct xfs_def_quota *defq;
571 int error;
572
573 error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp);
574
575 if (!error) {
576 xfs_disk_dquot_t *ddqp = &dqp->q_core;
577
578 defq = xfs_get_defquota(dqp, qinf);
579
580 /*
581 * Timers and warnings have been already set, let's just set the
582 * default limits for this quota type
583 */
584 defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
585 defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
586 defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
587 defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
588 defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
589 defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
590 xfs_qm_dqdestroy(dqp);
591 }
592}
593
563/* 594/*
564 * This initializes all the quota information that's kept in the 595 * This initializes all the quota information that's kept in the
565 * mount structure 596 * mount structure
@@ -606,19 +637,19 @@ xfs_qm_init_quotainfo(
606 * We try to get the limits from the superuser's limits fields. 637 * We try to get the limits from the superuser's limits fields.
607 * This is quite hacky, but it is standard quota practice. 638 * This is quite hacky, but it is standard quota practice.
608 * 639 *
609 * We look at the USR dquot with id == 0 first, but if user quotas
610 * are not enabled we goto the GRP dquot with id == 0.
611 * We don't really care to keep separate default limits for user
612 * and group quotas, at least not at this point.
613 *
614 * Since we may not have done a quotacheck by this point, just read 640 * Since we may not have done a quotacheck by this point, just read
615 * the dquot without attaching it to any hashtables or lists. 641 * the dquot without attaching it to any hashtables or lists.
642 *
 643 * Timers and warnings are globally set by the first timer found in
 644 * the user/group/proj quota types; otherwise a default value is
 645 * used. This should be split into separate fields per quota type.
616 */ 646 */
617 error = xfs_qm_dqread(mp, 0, 647 error = xfs_qm_dqread(mp, 0,
618 XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : 648 XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
619 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : 649 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
620 XFS_DQ_PROJ), 650 XFS_DQ_PROJ),
621 XFS_QMOPT_DOWARN, &dqp); 651 XFS_QMOPT_DOWARN, &dqp);
652
622 if (!error) { 653 if (!error) {
623 xfs_disk_dquot_t *ddqp = &dqp->q_core; 654 xfs_disk_dquot_t *ddqp = &dqp->q_core;
624 655
@@ -639,13 +670,6 @@ xfs_qm_init_quotainfo(
639 be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; 670 be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
640 qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? 671 qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
641 be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; 672 be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
642 qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
643 qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
644 qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
645 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
646 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
647 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
648
649 xfs_qm_dqdestroy(dqp); 673 xfs_qm_dqdestroy(dqp);
650 } else { 674 } else {
651 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; 675 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -656,6 +680,13 @@ xfs_qm_init_quotainfo(
656 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 680 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
657 } 681 }
658 682
683 if (XFS_IS_UQUOTA_RUNNING(mp))
684 xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
685 if (XFS_IS_GQUOTA_RUNNING(mp))
686 xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
687 if (XFS_IS_PQUOTA_RUNNING(mp))
688 xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
689
659 qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; 690 qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
660 qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; 691 qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
661 qinf->qi_shrinker.seeks = DEFAULT_SEEKS; 692 qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
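One detail worth noting in xfs_qm_set_defquota() above: dquot limits are stored big-endian on disk, so each field passes through be64_to_cpu() before being cached in host order. A self-contained sketch of that conversion, with the byte swap open-coded since be64_to_cpu() is a kernel helper:

#include <stdint.h>
#include <stdio.h>

/* assemble a host-order value from 8 big-endian bytes */
static uint64_t be64_to_host(const unsigned char b[8])
{
	uint64_t v = 0;

	for (int i = 0; i < 8; i++)
		v = (v << 8) | b[i];
	return v;
}

int main(void)
{
	/* big-endian 0x0000000000000400 == 1024 blocks */
	unsigned char disk[8] = { 0, 0, 0, 0, 0, 0, 4, 0 };

	printf("bhardlimit = %llu\n",
	       (unsigned long long)be64_to_host(disk));
	return 0;
}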
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 996a04064894..2975a822e9f0 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -53,6 +53,15 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
53 */ 53 */
54#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 54#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
55 55
56struct xfs_def_quota {
57 xfs_qcnt_t bhardlimit; /* default data blk hard limit */
58 xfs_qcnt_t bsoftlimit; /* default data blk soft limit */
59 xfs_qcnt_t ihardlimit; /* default inode count hard limit */
60 xfs_qcnt_t isoftlimit; /* default inode count soft limit */
61 xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */
62 xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */
63};
64
56/* 65/*
57 * Various quota information for individual filesystems. 66 * Various quota information for individual filesystems.
58 * The mount structure keeps a pointer to this. 67 * The mount structure keeps a pointer to this.
@@ -76,12 +85,9 @@ typedef struct xfs_quotainfo {
76 struct mutex qi_quotaofflock;/* to serialize quotaoff */ 85 struct mutex qi_quotaofflock;/* to serialize quotaoff */
77 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ 86 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
78 uint qi_dqperchunk; /* # ondisk dqs in above chunk */ 87 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
79 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ 88 struct xfs_def_quota qi_usr_default;
80 xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ 89 struct xfs_def_quota qi_grp_default;
81 xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ 90 struct xfs_def_quota qi_prj_default;
82 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
83 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
84 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
85 struct shrinker qi_shrinker; 91 struct shrinker qi_shrinker;
86} xfs_quotainfo_t; 92} xfs_quotainfo_t;
87 93
@@ -104,15 +110,15 @@ xfs_dquot_tree(
104} 110}
105 111
106static inline struct xfs_inode * 112static inline struct xfs_inode *
107xfs_dq_to_quota_inode(struct xfs_dquot *dqp) 113xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
108{ 114{
109 switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { 115 switch (dq_flags & XFS_DQ_ALLTYPES) {
110 case XFS_DQ_USER: 116 case XFS_DQ_USER:
111 return dqp->q_mount->m_quotainfo->qi_uquotaip; 117 return mp->m_quotainfo->qi_uquotaip;
112 case XFS_DQ_GROUP: 118 case XFS_DQ_GROUP:
113 return dqp->q_mount->m_quotainfo->qi_gquotaip; 119 return mp->m_quotainfo->qi_gquotaip;
114 case XFS_DQ_PROJ: 120 case XFS_DQ_PROJ:
115 return dqp->q_mount->m_quotainfo->qi_pquotaip; 121 return mp->m_quotainfo->qi_pquotaip;
116 default: 122 default:
117 ASSERT(0); 123 ASSERT(0);
118 } 124 }
@@ -164,11 +170,27 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
164 170
165/* quota ops */ 171/* quota ops */
166extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); 172extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
167extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, 173extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
168 uint, struct qc_dqblk *); 174 uint, struct qc_dqblk *, uint);
169extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 175extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
170 struct qc_dqblk *); 176 struct qc_dqblk *);
171extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); 177extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
172extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); 178extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
173 179
180static inline struct xfs_def_quota *
181xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
182{
183 struct xfs_def_quota *defq;
184
185 if (XFS_QM_ISUDQ(dqp))
186 defq = &qi->qi_usr_default;
187 else if (XFS_QM_ISGDQ(dqp))
188 defq = &qi->qi_grp_default;
189 else {
190 ASSERT(XFS_QM_ISPDQ(dqp));
191 defq = &qi->qi_prj_default;
192 }
193 return defq;
194}
195
174#endif /* __XFS_QM_H__ */ 196#endif /* __XFS_QM_H__ */
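xfs_get_defquota() is the lookup half of the split introduced here: instead of one shared set of qi_*limit fields, each quota type (user, group, project) now resolves to its own xfs_def_quota. A compact userspace sketch of that per-type dispatch, using simplified stand-in types rather than the kernel's:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum dq_type { DQ_USER, DQ_GROUP, DQ_PROJ };

struct def_quota { uint64_t bhardlimit, bsoftlimit; };

struct quotainfo {
	struct def_quota usr_default, grp_default, prj_default;
};

/* pick the default-limit block for a quota type */
static struct def_quota *get_defquota(struct quotainfo *qi, enum dq_type t)
{
	switch (t) {
	case DQ_USER:  return &qi->usr_default;
	case DQ_GROUP: return &qi->grp_default;
	default:
		assert(t == DQ_PROJ);
		return &qi->prj_default;
	}
}

int main(void)
{
	struct quotainfo qi = { .grp_default = { .bhardlimit = 100 } };

	printf("group bhard=%llu\n",
	       (unsigned long long)get_defquota(&qi, DQ_GROUP)->bhardlimit);
	return 0;
}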
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 3640c6e896af..f4d0e0a8f517 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -404,6 +404,7 @@ xfs_qm_scall_setqlim(
404 struct xfs_disk_dquot *ddq; 404 struct xfs_disk_dquot *ddq;
405 struct xfs_dquot *dqp; 405 struct xfs_dquot *dqp;
406 struct xfs_trans *tp; 406 struct xfs_trans *tp;
407 struct xfs_def_quota *defq;
407 int error; 408 int error;
408 xfs_qcnt_t hard, soft; 409 xfs_qcnt_t hard, soft;
409 410
@@ -431,6 +432,8 @@ xfs_qm_scall_setqlim(
431 ASSERT(error != -ENOENT); 432 ASSERT(error != -ENOENT);
432 goto out_unlock; 433 goto out_unlock;
433 } 434 }
435
436 defq = xfs_get_defquota(dqp, q);
434 xfs_dqunlock(dqp); 437 xfs_dqunlock(dqp);
435 438
436 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 439 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
@@ -458,8 +461,8 @@ xfs_qm_scall_setqlim(
458 ddq->d_blk_softlimit = cpu_to_be64(soft); 461 ddq->d_blk_softlimit = cpu_to_be64(soft);
459 xfs_dquot_set_prealloc_limits(dqp); 462 xfs_dquot_set_prealloc_limits(dqp);
460 if (id == 0) { 463 if (id == 0) {
461 q->qi_bhardlimit = hard; 464 defq->bhardlimit = hard;
462 q->qi_bsoftlimit = soft; 465 defq->bsoftlimit = soft;
463 } 466 }
464 } else { 467 } else {
465 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); 468 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
@@ -474,8 +477,8 @@ xfs_qm_scall_setqlim(
474 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 477 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
475 ddq->d_rtb_softlimit = cpu_to_be64(soft); 478 ddq->d_rtb_softlimit = cpu_to_be64(soft);
476 if (id == 0) { 479 if (id == 0) {
477 q->qi_rtbhardlimit = hard; 480 defq->rtbhardlimit = hard;
478 q->qi_rtbsoftlimit = soft; 481 defq->rtbsoftlimit = soft;
479 } 482 }
480 } else { 483 } else {
481 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); 484 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
@@ -491,8 +494,8 @@ xfs_qm_scall_setqlim(
491 ddq->d_ino_hardlimit = cpu_to_be64(hard); 494 ddq->d_ino_hardlimit = cpu_to_be64(hard);
492 ddq->d_ino_softlimit = cpu_to_be64(soft); 495 ddq->d_ino_softlimit = cpu_to_be64(soft);
493 if (id == 0) { 496 if (id == 0) {
494 q->qi_ihardlimit = hard; 497 defq->ihardlimit = hard;
495 q->qi_isoftlimit = soft; 498 defq->isoftlimit = soft;
496 } 499 }
497 } else { 500 } else {
498 xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); 501 xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
@@ -635,9 +638,10 @@ out:
635int 638int
636xfs_qm_scall_getquota( 639xfs_qm_scall_getquota(
637 struct xfs_mount *mp, 640 struct xfs_mount *mp,
638 xfs_dqid_t id, 641 xfs_dqid_t *id,
639 uint type, 642 uint type,
640 struct qc_dqblk *dst) 643 struct qc_dqblk *dst,
644 uint dqget_flags)
641{ 645{
642 struct xfs_dquot *dqp; 646 struct xfs_dquot *dqp;
643 int error; 647 int error;
@@ -647,7 +651,7 @@ xfs_qm_scall_getquota(
647 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't 651 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
648 * exist, we'll get ENOENT back. 652 * exist, we'll get ENOENT back.
649 */ 653 */
650 error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); 654 error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
651 if (error) 655 if (error)
652 return error; 656 return error;
653 657
@@ -660,6 +664,9 @@ xfs_qm_scall_getquota(
660 goto out_put; 664 goto out_put;
661 } 665 }
662 666
667 /* Fill in the ID we actually read from disk */
668 *id = be32_to_cpu(dqp->q_core.d_id);
669
663 memset(dst, 0, sizeof(*dst)); 670 memset(dst, 0, sizeof(*dst));
664 dst->d_spc_hardlimit = 671 dst->d_spc_hardlimit =
665 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 672 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -701,7 +708,7 @@ xfs_qm_scall_getquota(
701 if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || 708 if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
702 (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || 709 (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
703 (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && 710 (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
704 id != 0) { 711 *id != 0) {
705 if ((dst->d_space > dst->d_spc_softlimit) && 712 if ((dst->d_space > dst->d_spc_softlimit) &&
706 (dst->d_spc_softlimit > 0)) { 713 (dst->d_spc_softlimit > 0)) {
707 ASSERT(dst->d_spc_timer != 0); 714 ASSERT(dst->d_spc_timer != 0);
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7795e0d01382..f82d79a8c694 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -231,14 +231,45 @@ xfs_fs_get_dqblk(
231 struct qc_dqblk *qdq) 231 struct qc_dqblk *qdq)
232{ 232{
233 struct xfs_mount *mp = XFS_M(sb); 233 struct xfs_mount *mp = XFS_M(sb);
234 xfs_dqid_t id;
234 235
235 if (!XFS_IS_QUOTA_RUNNING(mp)) 236 if (!XFS_IS_QUOTA_RUNNING(mp))
236 return -ENOSYS; 237 return -ENOSYS;
237 if (!XFS_IS_QUOTA_ON(mp)) 238 if (!XFS_IS_QUOTA_ON(mp))
238 return -ESRCH; 239 return -ESRCH;
239 240
240 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), 241 id = from_kqid(&init_user_ns, qid);
241 xfs_quota_type(qid.type), qdq); 242 return xfs_qm_scall_getquota(mp, &id,
243 xfs_quota_type(qid.type), qdq, 0);
244}
245
246/* Return quota info for active quota >= this qid */
247STATIC int
248xfs_fs_get_nextdqblk(
249 struct super_block *sb,
250 struct kqid *qid,
251 struct qc_dqblk *qdq)
252{
253 int ret;
254 struct xfs_mount *mp = XFS_M(sb);
255 xfs_dqid_t id;
256
257 if (!XFS_IS_QUOTA_RUNNING(mp))
258 return -ENOSYS;
259 if (!XFS_IS_QUOTA_ON(mp))
260 return -ESRCH;
261
262 id = from_kqid(&init_user_ns, *qid);
263 ret = xfs_qm_scall_getquota(mp, &id,
264 xfs_quota_type(qid->type), qdq,
265 XFS_QMOPT_DQNEXT);
266 if (ret)
267 return ret;
268
269 /* ID may be different, so convert back what we got */
270 *qid = make_kqid(current_user_ns(), qid->type, id);
271 return 0;
272
242} 273}
243 274
244STATIC int 275STATIC int
@@ -267,5 +298,6 @@ const struct quotactl_ops xfs_quotactl_operations = {
267 .quota_disable = xfs_quota_disable, 298 .quota_disable = xfs_quota_disable,
268 .rm_xquota = xfs_fs_rm_xquota, 299 .rm_xquota = xfs_fs_rm_xquota,
269 .get_dqblk = xfs_fs_get_dqblk, 300 .get_dqblk = xfs_fs_get_dqblk,
301 .get_nextdqblk = xfs_fs_get_nextdqblk,
270 .set_dqblk = xfs_fs_set_dqblk, 302 .set_dqblk = xfs_fs_set_dqblk,
271}; 303};
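The new ->get_nextdqblk() hook implements "return the first active quota with ID >= the one passed in", reporting the ID it actually found so the caller can resume the scan past it; that is why xfs_qm_scall_getquota() now takes the ID by pointer. A toy iteration loop over that contract, with a sorted array standing in for the on-disk dquot tree (names here are illustrative, not the kernel API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* active quota IDs, sorted ascending */
static const uint32_t active[] = { 0, 4, 9, 1000 };
#define NACTIVE (sizeof(active) / sizeof(active[0]))

/* set *id to the first active quota >= *id; -1 means no more */
static int get_next(uint32_t *id)
{
	for (size_t i = 0; i < NACTIVE; i++) {
		if (active[i] >= *id) {
			*id = active[i];
			return 0;
		}
	}
	return -1;		/* -ENOENT in the kernel */
}

int main(void)
{
	uint32_t id = 0;

	while (get_next(&id) == 0) {
		printf("quota id %u\n", id);
		if (id == UINT32_MAX)
			break;
		id++;		/* resume past the one we just saw */
	}
	return 0;
}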
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index be02a68b2fe2..abf44435d04a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1272,7 +1272,7 @@ xfs_rtpick_extent(
1272 1272
1273 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); 1273 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
1274 1274
1275 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; 1275 seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
1276 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { 1276 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
1277 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 1277 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
1278 *seqp = 0; 1278 *seqp = 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59c9b7bd958d..d760934109b5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_quota.h" 46#include "xfs_quota.h"
47#include "xfs_sysfs.h" 47#include "xfs_sysfs.h"
48#include "xfs_ondisk.h"
48 49
49#include <linux/namei.h> 50#include <linux/namei.h>
50#include <linux/init.h> 51#include <linux/init.h>
@@ -65,83 +66,85 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
65static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ 66static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
66#endif 67#endif
67 68
68#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
69#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
70#define MNTOPT_LOGDEV "logdev" /* log device */
71#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
72#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
73#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
74#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
75#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
76#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
77#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
78#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
79#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
80#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */
81#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */
82#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
83#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
84#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
85#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
86#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
87 * unwritten extent conversion */
88#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
89#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
90#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
91 * XFS_MAXINUMBER_32 */
92#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
93#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
94#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
95#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes
96 * in stat(). */
97#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
98#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
99#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
100#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
101#define MNTOPT_NOQUOTA "noquota" /* no quotas */
102#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
103#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
104#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */
105#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
106#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
107#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */
108#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
114
115#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
116
117/* 69/*
118 * Table driven mount option parser. 70 * Table driven mount option parser.
119 *
120 * Currently only used for remount, but it will be used for mount
121 * in the future, too.
122 */ 71 */
123enum { 72enum {
124 Opt_barrier, 73 Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
125 Opt_nobarrier, 74 Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
126 Opt_inode64, 75 Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
127 Opt_inode32, 76 Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier,
128 Opt_err 77 Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep,
78 Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams,
79 Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
80 Opt_uquota, Opt_gquota, Opt_pquota,
81 Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
82 Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
129}; 83};
130 84
131static const match_table_t tokens = { 85static const match_table_t tokens = {
132 {Opt_barrier, "barrier"}, 86 {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */
133 {Opt_nobarrier, "nobarrier"}, 87 {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */
134 {Opt_inode64, "inode64"}, 88 {Opt_logdev, "logdev=%s"}, /* log device */
135 {Opt_inode32, "inode32"}, 89 {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */
136 {Opt_err, NULL} 90 {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */
91 {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */
92 {Opt_noalign, "noalign"}, /* turn off stripe alignment */
93 {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */
94 {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
95 {Opt_swidth, "swidth=%u"}, /* data volume stripe width */
96 {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
97 {Opt_mtpt, "mtpt"}, /* filesystem mount point */
98 {Opt_grpid, "grpid"}, /* group-ID from parent directory */
99 {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
100 {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
101 {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
102 {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
103 {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
104 {Opt_barrier, "barrier"}, /* use writer barriers for log write and
105 * unwritten extent conversion */
106 {Opt_nobarrier, "nobarrier"}, /* .. disable */
107 {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
108 {Opt_inode32, "inode32"}, /* inode allocation limited to
109 * XFS_MAXINUMBER_32 */
110 {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */
111 {Opt_noikeep, "noikeep"}, /* free empty inode clusters */
112 {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */
113 {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes
114 * in stat(). */
115 {Opt_attr2, "attr2"}, /* do use attr2 attribute format */
116 {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */
117 {Opt_filestreams,"filestreams"},/* use filestreams allocator */
118 {Opt_quota, "quota"}, /* disk quotas (user) */
119 {Opt_noquota, "noquota"}, /* no quotas */
120 {Opt_usrquota, "usrquota"}, /* user quota enabled */
121 {Opt_grpquota, "grpquota"}, /* group quota enabled */
122 {Opt_prjquota, "prjquota"}, /* project quota enabled */
123 {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */
124 {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */
125 {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */
126 {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
127 {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
128 {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
129 {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */
130 {Opt_discard, "discard"}, /* Discard unused blocks */
131 {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
132
133 {Opt_dax, "dax"}, /* Enable direct access to bdev pages */
134 {Opt_err, NULL},
137}; 135};
138 136
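The conversion in this hunk replaces dozens of MNTOPT_* strcmp chains with the kernel's table-driven match_token() parser: a pattern such as "logbufs=%u" both identifies the option and captures its argument. A userspace sketch of the same shape, with a plain prefix match standing in for the kernel's match_token()/substring_t machinery:

#define _DEFAULT_SOURCE		/* for strsep() in glibc */
#include <stdio.h>
#include <string.h>

enum { Opt_logbufs, Opt_nouuid, Opt_err };

static const struct { int token; const char *pattern; } tokens[] = {
	{ Opt_logbufs, "logbufs=" },	/* takes a numeric argument */
	{ Opt_nouuid,  "nouuid"   },	/* plain flag */
	{ Opt_err,     NULL       },
};

static int match(const char *p, const char **arg)
{
	for (int i = 0; tokens[i].pattern; i++) {
		size_t n = strlen(tokens[i].pattern);

		if (!strncmp(p, tokens[i].pattern, n)) {
			*arg = p + n;
			return tokens[i].token;
		}
	}
	return Opt_err;
}

int main(void)
{
	char opts[] = "logbufs=8,nouuid", *p, *rest = opts;
	const char *arg;

	while ((p = strsep(&rest, ",")) != NULL) {
		switch (match(p, &arg)) {
		case Opt_logbufs:
			printf("logbufs -> %s\n", arg);
			break;
		case Opt_nouuid:
			printf("nouuid set\n");
			break;
		default:
			printf("unknown mount option [%s]\n", p);
			return 1;
		}
	}
	return 0;
}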
139 137
140STATIC int 138STATIC int
141suffix_kstrtoint(char *s, unsigned int base, int *res) 139suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
142{ 140{
143 int last, shift_left_factor = 0, _res; 141 int last, shift_left_factor = 0, _res;
144 char *value = s; 142 char *value;
143 int ret = 0;
144
145 value = match_strdup(s);
146 if (!value)
147 return -ENOMEM;
145 148
146 last = strlen(value) - 1; 149 last = strlen(value) - 1;
147 if (value[last] == 'K' || value[last] == 'k') { 150 if (value[last] == 'K' || value[last] == 'k') {
@@ -157,10 +160,11 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
157 value[last] = '\0'; 160 value[last] = '\0';
158 } 161 }
159 162
160 if (kstrtoint(s, base, &_res)) 163 if (kstrtoint(value, base, &_res))
161 return -EINVAL; 164 ret = -EINVAL;
165 kfree(value);
162 *res = _res << shift_left_factor; 166 *res = _res << shift_left_factor;
163 return 0; 167 return ret;
164} 168}
165 169
166/* 170/*
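suffix_kstrtoint() now receives a substring_t, so it must copy the token with match_strdup() before it can null-terminate the suffix in place; that is why the rewrite gains the allocation, the kfree(), and the ret plumbing. The core idea, a K/M/G suffix folded into a left shift, in a self-contained userspace form:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* parse an int with an optional K/M/G binary suffix */
static int suffix_strtoint(const char *s, int *res)
{
	char buf[32];
	size_t last;
	int shift = 0;

	if (!*s || strlen(s) >= sizeof(buf))
		return -1;
	strcpy(buf, s);
	last = strlen(buf) - 1;
	switch (buf[last]) {
	case 'k': case 'K': shift = 10; buf[last] = '\0'; break;
	case 'm': case 'M': shift = 20; buf[last] = '\0'; break;
	case 'g': case 'G': shift = 30; buf[last] = '\0'; break;
	}
	*res = atoi(buf) << shift;
	return 0;
}

int main(void)
{
	int v;

	if (suffix_strtoint("32k", &v) == 0)
		printf("32k -> %d\n", v);	/* prints 32768 */
	return 0;
}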
@@ -169,14 +173,19 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
169 * 173 *
170 * Note that this function leaks the various device name allocations on 174 * Note that this function leaks the various device name allocations on
171 * failure. The caller takes care of them. 175 * failure. The caller takes care of them.
176 *
177 * *sb is const because this is also used to test options on the remount
178 * path, and we don't want this to have any side effects at remount time.
179 * Today this function does not change *sb, but just to future-proof...
172 */ 180 */
173STATIC int 181STATIC int
174xfs_parseargs( 182xfs_parseargs(
175 struct xfs_mount *mp, 183 struct xfs_mount *mp,
176 char *options) 184 char *options)
177{ 185{
178 struct super_block *sb = mp->m_super; 186 const struct super_block *sb = mp->m_super;
179 char *this_char, *value; 187 char *p;
188 substring_t args[MAX_OPT_ARGS];
180 int dsunit = 0; 189 int dsunit = 0;
181 int dswidth = 0; 190 int dswidth = 0;
182 int iosize = 0; 191 int iosize = 0;
@@ -217,152 +226,152 @@ xfs_parseargs(
217 if (!options) 226 if (!options)
218 goto done; 227 goto done;
219 228
220 while ((this_char = strsep(&options, ",")) != NULL) { 229 while ((p = strsep(&options, ",")) != NULL) {
221 if (!*this_char) 230 int token;
231
232 if (!*p)
222 continue; 233 continue;
223 if ((value = strchr(this_char, '=')) != NULL)
224 *value++ = 0;
225 234
226 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 235 token = match_token(p, tokens, args);
227 if (!value || !*value) { 236 switch (token) {
228 xfs_warn(mp, "%s option requires an argument", 237 case Opt_logbufs:
229 this_char); 238 if (match_int(args, &mp->m_logbufs))
230 return -EINVAL;
231 }
232 if (kstrtoint(value, 10, &mp->m_logbufs))
233 return -EINVAL;
234 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
235 if (!value || !*value) {
236 xfs_warn(mp, "%s option requires an argument",
237 this_char);
238 return -EINVAL;
239 }
240 if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
241 return -EINVAL; 239 return -EINVAL;
242 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 240 break;
243 if (!value || !*value) { 241 case Opt_logbsize:
244 xfs_warn(mp, "%s option requires an argument", 242 if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
245 this_char);
246 return -EINVAL; 243 return -EINVAL;
247 } 244 break;
248 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); 245 case Opt_logdev:
246 mp->m_logname = match_strdup(args);
249 if (!mp->m_logname) 247 if (!mp->m_logname)
250 return -ENOMEM; 248 return -ENOMEM;
251 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 249 break;
252 xfs_warn(mp, "%s option not allowed on this system", 250 case Opt_mtpt:
253 this_char); 251 xfs_warn(mp, "%s option not allowed on this system", p);
254 return -EINVAL; 252 return -EINVAL;
255 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 253 case Opt_rtdev:
256 if (!value || !*value) { 254 mp->m_rtname = match_strdup(args);
257 xfs_warn(mp, "%s option requires an argument",
258 this_char);
259 return -EINVAL;
260 }
261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname) 255 if (!mp->m_rtname)
263 return -ENOMEM; 256 return -ENOMEM;
264 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) || 257 break;
265 !strcmp(this_char, MNTOPT_BIOSIZE)) { 258 case Opt_allocsize:
266 if (!value || !*value) { 259 case Opt_biosize:
267 xfs_warn(mp, "%s option requires an argument", 260 if (suffix_kstrtoint(args, 10, &iosize))
268 this_char);
269 return -EINVAL;
270 }
271 if (suffix_kstrtoint(value, 10, &iosize))
272 return -EINVAL; 261 return -EINVAL;
273 iosizelog = ffs(iosize) - 1; 262 iosizelog = ffs(iosize) - 1;
274 } else if (!strcmp(this_char, MNTOPT_GRPID) || 263 break;
275 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 264 case Opt_grpid:
265 case Opt_bsdgroups:
276 mp->m_flags |= XFS_MOUNT_GRPID; 266 mp->m_flags |= XFS_MOUNT_GRPID;
277 } else if (!strcmp(this_char, MNTOPT_NOGRPID) || 267 break;
278 !strcmp(this_char, MNTOPT_SYSVGROUPS)) { 268 case Opt_nogrpid:
269 case Opt_sysvgroups:
279 mp->m_flags &= ~XFS_MOUNT_GRPID; 270 mp->m_flags &= ~XFS_MOUNT_GRPID;
280 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 271 break;
272 case Opt_wsync:
281 mp->m_flags |= XFS_MOUNT_WSYNC; 273 mp->m_flags |= XFS_MOUNT_WSYNC;
282 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 274 break;
275 case Opt_norecovery:
283 mp->m_flags |= XFS_MOUNT_NORECOVERY; 276 mp->m_flags |= XFS_MOUNT_NORECOVERY;
284 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 277 break;
278 case Opt_noalign:
285 mp->m_flags |= XFS_MOUNT_NOALIGN; 279 mp->m_flags |= XFS_MOUNT_NOALIGN;
286 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 280 break;
281 case Opt_swalloc:
287 mp->m_flags |= XFS_MOUNT_SWALLOC; 282 mp->m_flags |= XFS_MOUNT_SWALLOC;
288 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 283 break;
289 if (!value || !*value) { 284 case Opt_sunit:
290 xfs_warn(mp, "%s option requires an argument", 285 if (match_int(args, &dsunit))
291 this_char);
292 return -EINVAL;
293 }
294 if (kstrtoint(value, 10, &dsunit))
295 return -EINVAL;
296 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
297 if (!value || !*value) {
298 xfs_warn(mp, "%s option requires an argument",
299 this_char);
300 return -EINVAL; 286 return -EINVAL;
301 } 287 break;
302 if (kstrtoint(value, 10, &dswidth)) 288 case Opt_swidth:
289 if (match_int(args, &dswidth))
303 return -EINVAL; 290 return -EINVAL;
304 } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { 291 break;
292 case Opt_inode32:
305 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 293 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
306 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 294 break;
295 case Opt_inode64:
307 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 296 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
308 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 297 break;
298 case Opt_nouuid:
309 mp->m_flags |= XFS_MOUNT_NOUUID; 299 mp->m_flags |= XFS_MOUNT_NOUUID;
310 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 300 break;
301 case Opt_barrier:
311 mp->m_flags |= XFS_MOUNT_BARRIER; 302 mp->m_flags |= XFS_MOUNT_BARRIER;
312 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { 303 break;
304 case Opt_nobarrier:
313 mp->m_flags &= ~XFS_MOUNT_BARRIER; 305 mp->m_flags &= ~XFS_MOUNT_BARRIER;
314 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 306 break;
307 case Opt_ikeep:
315 mp->m_flags |= XFS_MOUNT_IKEEP; 308 mp->m_flags |= XFS_MOUNT_IKEEP;
316 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 309 break;
310 case Opt_noikeep:
317 mp->m_flags &= ~XFS_MOUNT_IKEEP; 311 mp->m_flags &= ~XFS_MOUNT_IKEEP;
318 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 312 break;
313 case Opt_largeio:
319 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; 314 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
320 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 315 break;
316 case Opt_nolargeio:
321 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 317 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
322 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 318 break;
319 case Opt_attr2:
323 mp->m_flags |= XFS_MOUNT_ATTR2; 320 mp->m_flags |= XFS_MOUNT_ATTR2;
324 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 321 break;
322 case Opt_noattr2:
325 mp->m_flags &= ~XFS_MOUNT_ATTR2; 323 mp->m_flags &= ~XFS_MOUNT_ATTR2;
326 mp->m_flags |= XFS_MOUNT_NOATTR2; 324 mp->m_flags |= XFS_MOUNT_NOATTR2;
327 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 325 break;
326 case Opt_filestreams:
328 mp->m_flags |= XFS_MOUNT_FILESTREAMS; 327 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
329 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 328 break;
329 case Opt_noquota:
330 mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; 330 mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
331 mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; 331 mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
332 mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; 332 mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
333 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 333 break;
334 !strcmp(this_char, MNTOPT_UQUOTA) || 334 case Opt_quota:
335 !strcmp(this_char, MNTOPT_USRQUOTA)) { 335 case Opt_uquota:
336 case Opt_usrquota:
336 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | 337 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
337 XFS_UQUOTA_ENFD); 338 XFS_UQUOTA_ENFD);
338 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || 339 break;
339 !strcmp(this_char, MNTOPT_UQUOTANOENF)) { 340 case Opt_qnoenforce:
341 case Opt_uqnoenforce:
340 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); 342 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
341 mp->m_qflags &= ~XFS_UQUOTA_ENFD; 343 mp->m_qflags &= ~XFS_UQUOTA_ENFD;
342 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 344 break;
343 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 345 case Opt_pquota:
346 case Opt_prjquota:
344 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 347 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
345 XFS_PQUOTA_ENFD); 348 XFS_PQUOTA_ENFD);
346 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 349 break;
350 case Opt_pqnoenforce:
347 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); 351 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
348 mp->m_qflags &= ~XFS_PQUOTA_ENFD; 352 mp->m_qflags &= ~XFS_PQUOTA_ENFD;
349 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 353 case Opt_gquota:
350 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 354 case Opt_grpquota:
351 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 355 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
352 XFS_GQUOTA_ENFD); 356 XFS_GQUOTA_ENFD);
353 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 357 break;
358 case Opt_gqnoenforce:
354 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 359 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
355 mp->m_qflags &= ~XFS_GQUOTA_ENFD; 360 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
356 } else if (!strcmp(this_char, MNTOPT_DISCARD)) { 361 break;
362 case Opt_discard:
357 mp->m_flags |= XFS_MOUNT_DISCARD; 363 mp->m_flags |= XFS_MOUNT_DISCARD;
358 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 364 break;
365 case Opt_nodiscard:
359 mp->m_flags &= ~XFS_MOUNT_DISCARD; 366 mp->m_flags &= ~XFS_MOUNT_DISCARD;
367 break;
360#ifdef CONFIG_FS_DAX 368#ifdef CONFIG_FS_DAX
361 } else if (!strcmp(this_char, MNTOPT_DAX)) { 369 case Opt_dax:
362 mp->m_flags |= XFS_MOUNT_DAX; 370 mp->m_flags |= XFS_MOUNT_DAX;
371 break;
363#endif 372#endif
364 } else { 373 default:
365 xfs_warn(mp, "unknown mount option [%s].", this_char); 374 xfs_warn(mp, "unknown mount option [%s].", p);
366 return -EINVAL; 375 return -EINVAL;
367 } 376 }
368 } 377 }
@@ -461,25 +470,25 @@ xfs_showargs(
461{ 470{
462 static struct proc_xfs_info xfs_info_set[] = { 471 static struct proc_xfs_info xfs_info_set[] = {
463 /* the few simple ones we can get from the mount struct */ 472 /* the few simple ones we can get from the mount struct */
464 { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, 473 { XFS_MOUNT_IKEEP, ",ikeep" },
465 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, 474 { XFS_MOUNT_WSYNC, ",wsync" },
466 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, 475 { XFS_MOUNT_NOALIGN, ",noalign" },
467 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, 476 { XFS_MOUNT_SWALLOC, ",swalloc" },
468 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 477 { XFS_MOUNT_NOUUID, ",nouuid" },
469 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, 478 { XFS_MOUNT_NORECOVERY, ",norecovery" },
470 { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, 479 { XFS_MOUNT_ATTR2, ",attr2" },
471 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 480 { XFS_MOUNT_FILESTREAMS, ",filestreams" },
472 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 481 { XFS_MOUNT_GRPID, ",grpid" },
473 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, 482 { XFS_MOUNT_DISCARD, ",discard" },
474 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, 483 { XFS_MOUNT_SMALL_INUMS, ",inode32" },
475 { XFS_MOUNT_DAX, "," MNTOPT_DAX }, 484 { XFS_MOUNT_DAX, ",dax" },
476 { 0, NULL } 485 { 0, NULL }
477 }; 486 };
478 static struct proc_xfs_info xfs_info_unset[] = { 487 static struct proc_xfs_info xfs_info_unset[] = {
479 /* the few simple ones we can get from the mount struct */ 488 /* the few simple ones we can get from the mount struct */
480 { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, 489 { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" },
481 { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, 490 { XFS_MOUNT_BARRIER, ",nobarrier" },
482 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, 491 { XFS_MOUNT_SMALL_INUMS, ",inode64" },
483 { 0, NULL } 492 { 0, NULL }
484 }; 493 };
485 struct proc_xfs_info *xfs_infop; 494 struct proc_xfs_info *xfs_infop;
@@ -494,46 +503,46 @@ xfs_showargs(
494 } 503 }
495 504
496 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 505 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
497 seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", 506 seq_printf(m, ",allocsize=%dk",
498 (int)(1 << mp->m_writeio_log) >> 10); 507 (int)(1 << mp->m_writeio_log) >> 10);
499 508
500 if (mp->m_logbufs > 0) 509 if (mp->m_logbufs > 0)
501 seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); 510 seq_printf(m, ",logbufs=%d", mp->m_logbufs);
502 if (mp->m_logbsize > 0) 511 if (mp->m_logbsize > 0)
503 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); 512 seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10);
504 513
505 if (mp->m_logname) 514 if (mp->m_logname)
506 seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); 515 seq_show_option(m, "logdev", mp->m_logname);
507 if (mp->m_rtname) 516 if (mp->m_rtname)
508 seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); 517 seq_show_option(m, "rtdev", mp->m_rtname);
509 518
510 if (mp->m_dalign > 0) 519 if (mp->m_dalign > 0)
511 seq_printf(m, "," MNTOPT_SUNIT "=%d", 520 seq_printf(m, ",sunit=%d",
512 (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); 521 (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
513 if (mp->m_swidth > 0) 522 if (mp->m_swidth > 0)
514 seq_printf(m, "," MNTOPT_SWIDTH "=%d", 523 seq_printf(m, ",swidth=%d",
515 (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); 524 (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
516 525
517 if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) 526 if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
518 seq_puts(m, "," MNTOPT_USRQUOTA); 527 seq_puts(m, ",usrquota");
519 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 528 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
520 seq_puts(m, "," MNTOPT_UQUOTANOENF); 529 seq_puts(m, ",uqnoenforce");
521 530
522 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 531 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
523 if (mp->m_qflags & XFS_PQUOTA_ENFD) 532 if (mp->m_qflags & XFS_PQUOTA_ENFD)
524 seq_puts(m, "," MNTOPT_PRJQUOTA); 533 seq_puts(m, ",prjquota");
525 else 534 else
526 seq_puts(m, "," MNTOPT_PQUOTANOENF); 535 seq_puts(m, ",pqnoenforce");
527 } 536 }
528 if (mp->m_qflags & XFS_GQUOTA_ACCT) { 537 if (mp->m_qflags & XFS_GQUOTA_ACCT) {
529 if (mp->m_qflags & XFS_GQUOTA_ENFD) 538 if (mp->m_qflags & XFS_GQUOTA_ENFD)
530 seq_puts(m, "," MNTOPT_GRPQUOTA); 539 seq_puts(m, ",grpquota");
531 else 540 else
532 seq_puts(m, "," MNTOPT_GQUOTANOENF); 541 seq_puts(m, ",gqnoenforce");
533 } 542 }
534 543
535 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 544 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
536 seq_puts(m, "," MNTOPT_NOQUOTA); 545 seq_puts(m, ",noquota");
537 546
538 return 0; 547 return 0;
539} 548}
@@ -572,23 +581,35 @@ xfs_max_file_offset(
572} 581}
573 582
574/* 583/*
575 * xfs_set_inode32() and xfs_set_inode64() are passed an agcount 584 * Set parameters for inode allocation heuristics, taking into account
576 * because in the growfs case, mp->m_sb.sb_agcount is not updated 585 * filesystem size and inode32/inode64 mount options; i.e. specifically
577 * yet to the potentially higher ag count. 586 * whether or not XFS_MOUNT_SMALL_INUMS is set.
587 *
588 * Inode allocation patterns are altered only if inode32 is requested
589 * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
590 * If altered, XFS_MOUNT_32BITINODES is set as well.
591 *
592 * An agcount independent of that in the mount structure is provided
593 * because in the growfs case, mp->m_sb.sb_agcount is not yet updated
594 * to the potentially higher ag count.
595 *
596 * Returns the maximum AG index which may contain inodes.
578 */ 597 */
579xfs_agnumber_t 598xfs_agnumber_t
580xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) 599xfs_set_inode_alloc(
600 struct xfs_mount *mp,
601 xfs_agnumber_t agcount)
581{ 602{
582 xfs_agnumber_t index = 0; 603 xfs_agnumber_t index;
583 xfs_agnumber_t maxagi = 0; 604 xfs_agnumber_t maxagi = 0;
584 xfs_sb_t *sbp = &mp->m_sb; 605 xfs_sb_t *sbp = &mp->m_sb;
585 xfs_agnumber_t max_metadata; 606 xfs_agnumber_t max_metadata;
586 xfs_agino_t agino; 607 xfs_agino_t agino;
587 xfs_ino_t ino; 608 xfs_ino_t ino;
588 xfs_perag_t *pag;
589 609
590 /* Calculate how much should be reserved for inodes to meet 610 /*
591 * the max inode percentage. 611 * Calculate how much should be reserved for inodes to meet
612 * the max inode percentage. Used only for inode32.
592 */ 613 */
593 if (mp->m_maxicount) { 614 if (mp->m_maxicount) {
594 __uint64_t icount; 615 __uint64_t icount;
@@ -602,54 +623,48 @@ xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
602 max_metadata = agcount; 623 max_metadata = agcount;
603 } 624 }
604 625
626 /* Get the last possible inode in the filesystem */
605 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); 627 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
628 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
629
630 /*
631 * If user asked for no more than 32-bit inodes, and the fs is
632 * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
633 * the allocator to accommodate the request.
634 */
635 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
636 mp->m_flags |= XFS_MOUNT_32BITINODES;
637 else
638 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
606 639
607 for (index = 0; index < agcount; index++) { 640 for (index = 0; index < agcount; index++) {
608 ino = XFS_AGINO_TO_INO(mp, index, agino); 641 struct xfs_perag *pag;
609 642
610 if (ino > XFS_MAXINUMBER_32) { 643 ino = XFS_AGINO_TO_INO(mp, index, agino);
611 pag = xfs_perag_get(mp, index);
612 pag->pagi_inodeok = 0;
613 pag->pagf_metadata = 0;
614 xfs_perag_put(pag);
615 continue;
616 }
617 644
618 pag = xfs_perag_get(mp, index); 645 pag = xfs_perag_get(mp, index);
619 pag->pagi_inodeok = 1;
620 maxagi++;
621 if (index < max_metadata)
622 pag->pagf_metadata = 1;
623 xfs_perag_put(pag);
624 }
625 mp->m_flags |= (XFS_MOUNT_32BITINODES |
626 XFS_MOUNT_SMALL_INUMS);
627 646
628 return maxagi; 647 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
629} 648 if (ino > XFS_MAXINUMBER_32) {
630 649 pag->pagi_inodeok = 0;
631xfs_agnumber_t 650 pag->pagf_metadata = 0;
632xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount) 651 } else {
633{ 652 pag->pagi_inodeok = 1;
634 xfs_agnumber_t index = 0; 653 maxagi++;
635 654 if (index < max_metadata)
636 for (index = 0; index < agcount; index++) { 655 pag->pagf_metadata = 1;
637 struct xfs_perag *pag; 656 else
657 pag->pagf_metadata = 0;
658 }
659 } else {
660 pag->pagi_inodeok = 1;
661 pag->pagf_metadata = 0;
662 }
638 663
639 pag = xfs_perag_get(mp, index);
640 pag->pagi_inodeok = 1;
641 pag->pagf_metadata = 0;
642 xfs_perag_put(pag); 664 xfs_perag_put(pag);
643 } 665 }
644 666
645 /* There is no need for lock protection on m_flags, 667 return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
646 * the rw_semaphore of the VFS superblock is locked
647 * during mount/umount/remount operations, so this is
648 * enough to avoid concurency on the m_flags field
649 */
650 mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
651 XFS_MOUNT_SMALL_INUMS);
652 return index;
653} 668}
654 669
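The merged xfs_set_inode_alloc() makes the inode32 decision explicit: the allocator is only restricted when inode32 was requested (XFS_MOUNT_SMALL_INUMS) AND the filesystem's highest possible inode number does not fit in 32 bits, and only then is XFS_MOUNT_32BITINODES set. A sketch of that predicate with simplified geometry (real XFS packs AG/block/offset bits via XFS_AGINO_TO_INO and friends):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAXINUMBER_32 0xffffffffULL

/* restrict allocation only if asked to AND the fs is big enough */
static bool need_inode32(bool small_inums_requested,
			 uint64_t highest_possible_ino)
{
	return small_inums_requested && highest_possible_ino > MAXINUMBER_32;
}

int main(void)
{
	/* hypothetical geometry: last inode lands above 2^32 */
	uint64_t last_ino = 1ULL << 40;

	printf("restrict allocator: %d\n", need_inode32(true, last_ino));
	printf("restrict allocator: %d\n", need_inode32(false, last_ino));
	return 0;
}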
655STATIC int 670STATIC int
@@ -1166,6 +1181,27 @@ xfs_quiesce_attr(
1166} 1181}
1167 1182
1168STATIC int 1183STATIC int
1184xfs_test_remount_options(
1185 struct super_block *sb,
1186 struct xfs_mount *mp,
1187 char *options)
1188{
1189 int error = 0;
1190 struct xfs_mount *tmp_mp;
1191
1192 tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
1193 if (!tmp_mp)
1194 return -ENOMEM;
1195
1196 tmp_mp->m_super = sb;
1197 error = xfs_parseargs(tmp_mp, options);
1198 xfs_free_fsname(tmp_mp);
1199 kfree(tmp_mp);
1200
1201 return error;
1202}
1203
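xfs_test_remount_options() is a dry-run: the new option string is parsed into a throwaway xfs_mount so invalid input is rejected before the live mount state is touched. The general pattern, sketched in userspace with a hypothetical parse_opts() standing in for xfs_parseargs():

#include <stdio.h>

struct mount_opts { int logbufs; };

static int parse_opts(struct mount_opts *mo, const char *options)
{
	/* toy parser: only "logbufs=<n>" is understood */
	if (sscanf(options, "logbufs=%d", &mo->logbufs) == 1)
		return 0;
	return -1;		/* -EINVAL in the kernel */
}

static int remount(struct mount_opts *live, const char *options)
{
	struct mount_opts tmp = { 0 };

	/* first, check for complete junk on a scratch copy */
	if (parse_opts(&tmp, options))
		return -1;

	return parse_opts(live, options);	/* now apply for real */
}

int main(void)
{
	struct mount_opts live = { .logbufs = 2 };

	printf("remount(bogus) = %d, logbufs=%d\n",
	       remount(&live, "bogus"), live.logbufs);
	printf("remount(good)  = %d, logbufs=%d\n",
	       remount(&live, "logbufs=8"), live.logbufs);
	return 0;
}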
1204STATIC int
1169xfs_fs_remount( 1205xfs_fs_remount(
1170 struct super_block *sb, 1206 struct super_block *sb,
1171 int *flags, 1207 int *flags,
@@ -1177,6 +1213,11 @@ xfs_fs_remount(
1177 char *p; 1213 char *p;
1178 int error; 1214 int error;
1179 1215
1216 /* First, check for complete junk; i.e. invalid options */
1217 error = xfs_test_remount_options(sb, mp, options);
1218 if (error)
1219 return error;
1220
1180 sync_filesystem(sb); 1221 sync_filesystem(sb);
1181 while ((p = strsep(&options, ",")) != NULL) { 1222 while ((p = strsep(&options, ",")) != NULL) {
1182 int token; 1223 int token;
@@ -1193,10 +1234,12 @@ xfs_fs_remount(
1193 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1234 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1194 break; 1235 break;
1195 case Opt_inode64: 1236 case Opt_inode64:
1196 mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount); 1237 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
1238 mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
1197 break; 1239 break;
1198 case Opt_inode32: 1240 case Opt_inode32:
1199 mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount); 1241 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
1242 mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
1200 break; 1243 break;
1201 default: 1244 default:
1202 /* 1245 /*
@@ -1344,9 +1387,8 @@ xfs_finish_flags(
1344 */ 1387 */
1345 if (xfs_sb_version_hascrc(&mp->m_sb) && 1388 if (xfs_sb_version_hascrc(&mp->m_sb) &&
1346 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 1389 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
1347 xfs_warn(mp, 1390 xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
1348"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", 1391 "attr2 is always enabled for V5 filesystems.");
1349 MNTOPT_NOATTR2, MNTOPT_ATTR2);
1350 return -EINVAL; 1392 return -EINVAL;
1351 } 1393 }
1352 1394
@@ -1817,6 +1859,8 @@ init_xfs_fs(void)
1817{ 1859{
1818 int error; 1860 int error;
1819 1861
1862 xfs_check_ondisk_structs();
1863
1820 printk(KERN_INFO XFS_VERSION_STRING " with " 1864 printk(KERN_INFO XFS_VERSION_STRING " with "
1821 XFS_BUILD_OPTIONS " enabled\n"); 1865 XFS_BUILD_OPTIONS " enabled\n");
1822 1866
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 499058fea303..2dfb1ce4585f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -65,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
65 65
66extern void xfs_flush_inodes(struct xfs_mount *mp); 66extern void xfs_flush_inodes(struct xfs_mount *mp);
67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
68extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount); 68extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
69extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount); 69 xfs_agnumber_t agcount);
70 70
71extern const struct export_operations xfs_export_operations; 71extern const struct export_operations xfs_export_operations;
72extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 641d625eb334..6ced4f143494 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -18,10 +18,13 @@
18 18
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_sysfs.h" 20#include "xfs_sysfs.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
22#include "xfs_log.h" 24#include "xfs_log.h"
23#include "xfs_log_priv.h" 25#include "xfs_log_priv.h"
24#include "xfs_stats.h" 26#include "xfs_stats.h"
27#include "xfs_mount.h"
25 28
26struct xfs_sysfs_attr { 29struct xfs_sysfs_attr {
27 struct attribute attr; 30 struct attribute attr;
@@ -45,16 +48,6 @@ to_attr(struct attribute *attr)
45 48
46#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr 49#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
47 50
48/*
49 * xfs_mount kobject. This currently has no attributes and thus no need for show
50 * and store helpers. The mp kobject serves as the per-mount parent object that
51 * is identified by the fsname under sysfs.
52 */
53
54struct kobj_type xfs_mp_ktype = {
55 .release = xfs_sysfs_release,
56};
57
58STATIC ssize_t 51STATIC ssize_t
59xfs_sysfs_object_show( 52xfs_sysfs_object_show(
60 struct kobject *kobject, 53 struct kobject *kobject,
@@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = {
83 .store = xfs_sysfs_object_store, 76 .store = xfs_sysfs_object_store,
84}; 77};
85 78
79/*
80 * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
81 * that is identified by the fsname under sysfs.
82 */
83
84static inline struct xfs_mount *
85to_mp(struct kobject *kobject)
86{
87 struct xfs_kobj *kobj = to_kobj(kobject);
88
89 return container_of(kobj, struct xfs_mount, m_kobj);
90}
91
92#ifdef DEBUG
93
94STATIC ssize_t
95fail_writes_store(
96 struct kobject *kobject,
97 const char *buf,
98 size_t count)
99{
100 struct xfs_mount *mp = to_mp(kobject);
101 int ret;
102 int val;
103
104 ret = kstrtoint(buf, 0, &val);
105 if (ret)
106 return ret;
107
108 if (val == 1)
109 mp->m_fail_writes = true;
110 else if (val == 0)
111 mp->m_fail_writes = false;
112 else
113 return -EINVAL;
114
115 return count;
116}
117
118STATIC ssize_t
119fail_writes_show(
120 struct kobject *kobject,
121 char *buf)
122{
123 struct xfs_mount *mp = to_mp(kobject);
124
125 return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
126}
127XFS_SYSFS_ATTR_RW(fail_writes);
128
129#endif /* DEBUG */
130
131static struct attribute *xfs_mp_attrs[] = {
132#ifdef DEBUG
133 ATTR_LIST(fail_writes),
134#endif
135 NULL,
136};
137
138struct kobj_type xfs_mp_ktype = {
139 .release = xfs_sysfs_release,
140 .sysfs_ops = &xfs_sysfs_ops,
141 .default_attrs = xfs_mp_attrs,
142};
143
86#ifdef DEBUG 144#ifdef DEBUG
87/* debug */ 145/* debug */
88 146
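With the mp kobject gaining sysfs_ops and a default attribute list, the DEBUG-only fail_writes knob becomes reachable from userspace. A hedged usage sketch; the exact path is an assumption here (the mp kobject is published under the fsname below /sys/fs/xfs, so it depends on the device name, and the file only exists on DEBUG kernels):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* assumed path: per-mount kobject named after the device */
	const char *knob = "/sys/fs/xfs/sda1/fail_writes";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror(knob);	/* absent on non-DEBUG kernels */
		return 1;
	}
	if (write(fd, "1", 1) != 1)	/* start failing writes */
		perror("write");
	close(fd);
	return 0;
}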
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 391d797cb53f..c8d58426008e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
 DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
 
 DECLARE_EVENT_CLASS(xfs_itrunc_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 748b16aff45a..20c53666cb4b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1028,6 +1028,8 @@ __xfs_trans_roll(
 	struct xfs_trans_res	tres;
 	int			error;
 
+	*committed = 0;
+
 	/*
 	 * Ensure that the inode is always logged.
 	 */
@@ -1082,6 +1084,6 @@ xfs_trans_roll(
 	struct xfs_trans	**tpp,
 	struct xfs_inode	*dp)
 {
-	int			committed = 0;
+	int			committed;
 	return __xfs_trans_roll(tpp, dp, &committed);
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 4643070d7cae..e7c49cf43fbc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,7 +133,6 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
  */
-#define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
 #define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
 
 #if defined(DEBUG) || defined(XFS_WARN)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 4f18fd92ca13..d6c9c3e9e02b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,6 +497,7 @@ xfsaild(
 	long		tout = 0;	/* milliseconds */
 
 	current->flags |= PF_MEMALLOC;
+	set_freezable();
 
 	while (!kthread_should_stop()) {
 		if (tout && tout <= 20)
@@ -519,14 +520,14 @@ xfsaild(
 		if (!xfs_ail_min(ailp) &&
 		    ailp->xa_target == ailp->xa_target_prev) {
 			spin_unlock(&ailp->xa_lock);
-			schedule();
+			freezable_schedule();
 			tout = 0;
 			continue;
 		}
 		spin_unlock(&ailp->xa_lock);
 
 		if (tout)
-			schedule_timeout(msecs_to_jiffies(tout));
+			freezable_schedule_timeout(msecs_to_jiffies(tout));
 
 		__set_current_state(TASK_RUNNING);
 
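Note: set_freezable() and the freezable_schedule*() helpers are the stock freezer API from include/linux/freezer.h; kernel threads are unfreezable by default, so without this opt-in xfsaild could hold up a system suspend. A minimal sketch of the pattern for a generic kthread (everything except the freezer/kthread helpers is illustrative):

	#include <linux/freezer.h>
	#include <linux/kthread.h>

	static int demo_thread(void *data)
	{
		set_freezable();	/* opt in to the freezer */

		while (!kthread_should_stop()) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* the sleep doubles as a freeze point during suspend */
			freezable_schedule_timeout(msecs_to_jiffies(50));
			__set_current_state(TASK_RUNNING);
			/* ... do one unit of work ... */
		}
		return 0;
	}
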
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 75798412859a..8ee29ca132dc 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -155,7 +155,7 @@ xfs_trans_get_buf_map(
 	ASSERT(xfs_buf_islocked(bp));
 	if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
 		xfs_buf_stale(bp);
-		XFS_BUF_DONE(bp);
+		bp->b_flags |= XBF_DONE;
 	}
 
 	ASSERT(bp->b_transp == tp);
@@ -518,7 +518,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 	 * inside the b_bdstrat callback so that this won't get written to
 	 * disk.
 	 */
-	XFS_BUF_DONE(bp);
+	bp->b_flags |= XBF_DONE;
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bp->b_iodone = xfs_buf_iodone_callbacks;
@@ -534,8 +534,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 	 */
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		bip->bli_flags &= ~XFS_BLI_STALE;
-		ASSERT(XFS_BUF_ISSTALE(bp));
-		XFS_BUF_UNSTALE(bp);
+		ASSERT(bp->b_flags & XBF_STALE);
+		bp->b_flags &= ~XBF_STALE;
 		bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
@@ -600,7 +600,7 @@ xfs_trans_binval(
 		 * If the buffer is already invalidated, then
 		 * just return.
 		 */
-		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(bp->b_flags & XBF_STALE);
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 995170194df0..c3d547211d16 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -609,17 +609,20 @@ xfs_trans_dqresv(
 	xfs_qcnt_t		total_count;
 	xfs_qcnt_t		*resbcountp;
 	xfs_quotainfo_t		*q = mp->m_quotainfo;
+	struct xfs_def_quota	*defq;
 
 
 	xfs_dqlock(dqp);
 
+	defq = xfs_get_defquota(dqp, q);
+
 	if (flags & XFS_TRANS_DQ_RES_BLKS) {
 		hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_bhardlimit;
+			hardlimit = defq->bhardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_bsoftlimit;
+			softlimit = defq->bsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_btimer);
 		warns = be16_to_cpu(dqp->q_core.d_bwarns);
 		warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
@@ -628,10 +631,10 @@ xfs_trans_dqresv(
 		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
 		hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_rtbhardlimit;
+			hardlimit = defq->rtbhardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_rtbsoftlimit;
+			softlimit = defq->rtbsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
 		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
 		warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
@@ -672,10 +675,10 @@ xfs_trans_dqresv(
 		warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
 		hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
 		if (!hardlimit)
-			hardlimit = q->qi_ihardlimit;
+			hardlimit = defq->ihardlimit;
 		softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
 		if (!softlimit)
-			softlimit = q->qi_isoftlimit;
+			softlimit = defq->isoftlimit;
 
 		if (hardlimit && total_count > hardlimit) {
 			xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
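Note: xfs_get_defquota() is assumed to return the set of default limits matching the dquot's type (user, group, or project), now that each quota type carries its own defaults instead of the single per-mount qi_* copies being replaced above. Roughly, with illustrative field names (the real definition lives in the XFS quota headers):

	static inline struct xfs_def_quota *
	xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
	{
		if (XFS_QM_ISUDQ(dqp))		/* user quota */
			return &qi->qi_usr_default;
		if (XFS_QM_ISGDQ(dqp))		/* group quota */
			return &qi->qi_grp_default;
		return &qi->qi_prj_default;	/* otherwise: project quota */
	}
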
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index b97f1df910ab..11a3af08b5c7 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -75,18 +75,10 @@ xfs_trans_ichgtime(
 
 	tv = current_fs_time(inode->i_sb);
 
-	if ((flags & XFS_ICHGTIME_MOD) &&
-	    !timespec_equal(&inode->i_mtime, &tv)) {
-		inode->i_mtime = tv;
-		ip->i_d.di_mtime.t_sec = tv.tv_sec;
-		ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
-	}
-	if ((flags & XFS_ICHGTIME_CHG) &&
-	    !timespec_equal(&inode->i_ctime, &tv)) {
-		inode->i_ctime = tv;
-		ip->i_d.di_ctime.t_sec = tv.tv_sec;
-		ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
-	}
+	if (flags & XFS_ICHGTIME_MOD)
+		inode->i_mtime = tv;
+	if (flags & XFS_ICHGTIME_CHG)
+		inode->i_ctime = tv;
 }
 
 /*
@@ -125,7 +117,7 @@ xfs_trans_log_inode(
 	 */
 	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
 	    IS_I_VERSION(VFS_I(ip))) {
-		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+		VFS_I(ip)->i_version++;
 		flags |= XFS_ILOG_CORE;
 	}
 