aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/attr.c11
-rw-r--r--fs/autofs4/autofs_i.h8
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/expire.c9
-rw-r--r--fs/autofs4/inode.c24
-rw-r--r--fs/autofs4/root.c83
-rw-r--r--fs/autofs4/waitq.c5
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/binfmt_aout.c5
-rw-r--r--fs/binfmt_elf.c9
-rw-r--r--fs/binfmt_elf_fdpic.c6
-rw-r--r--fs/binfmt_em86.c5
-rw-r--r--fs/binfmt_flat.c5
-rw-r--r--fs/binfmt_misc.c10
-rw-r--r--fs/binfmt_script.c8
-rw-r--r--fs/binfmt_som.c5
-rw-r--r--fs/block_dev.c170
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c2
-rw-r--r--fs/btrfs/backref.c16
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/check-integrity.c31
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c241
-rw-r--r--fs/btrfs/ctree.h184
-rw-r--r--fs/btrfs/delayed-inode.c11
-rw-r--r--fs/btrfs/dev-replace.c856
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c59
-rw-r--r--fs/btrfs/disk-io.c146
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c229
-rw-r--r--fs/btrfs/extent_io.c37
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c27
-rw-r--r--fs/btrfs/extent_map.h2
-rw-r--r--fs/btrfs/file-item.c21
-rw-r--r--fs/btrfs/file.c425
-rw-r--r--fs/btrfs/free-space-cache.c51
-rw-r--r--fs/btrfs/inode-map.c5
-rw-r--r--fs/btrfs/inode.c484
-rw-r--r--fs/btrfs/ioctl.c319
-rw-r--r--fs/btrfs/ioctl.h48
-rw-r--r--fs/btrfs/math.h44
-rw-r--r--fs/btrfs/ordered-data.c90
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/reada.c31
-rw-r--r--fs/btrfs/relocation.c40
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c1836
-rw-r--r--fs/btrfs/send.c8
-rw-r--r--fs/btrfs/super.c48
-rw-r--r--fs/btrfs/transaction.c170
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c477
-rw-r--r--fs/btrfs/volumes.c966
-rw-r--r--fs/btrfs/volumes.h35
-rw-r--r--fs/btrfs/xattr.c13
-rw-r--r--fs/buffer.c163
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/export.c4
-rw-r--r--fs/ceph/file.c6
-rw-r--r--fs/cifs/Kconfig10
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/cifs_debug.h72
-rw-r--r--fs/cifs/cifsacl.c777
-rw-r--r--fs/cifs/cifsacl.h66
-rw-r--r--fs/cifs/cifsfs.c25
-rw-r--r--fs/cifs/cifsglob.h36
-rw-r--r--fs/cifs/cifsproto.h10
-rw-r--r--fs/cifs/connect.c310
-rw-r--r--fs/cifs/dir.c43
-rw-r--r--fs/cifs/file.c206
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/netmisc.c14
-rw-r--r--fs/cifs/readdir.c55
-rw-r--r--fs/cifs/smb1ops.c35
-rw-r--r--fs/cifs/smb2file.c12
-rw-r--r--fs/cifs/smb2ops.c103
-rw-r--r--fs/cifs/smb2pdu.c5
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/smb2transport.c13
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/configfs/dir.c4
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/debugfs/inode.c1
-rw-r--r--fs/devpts/inode.c61
-rw-r--r--fs/direct-io.c8
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/dlm/lock.c16
-rw-r--r--fs/dlm/lowcomms.c5
-rw-r--r--fs/dlm/recover.c37
-rw-r--r--fs/eventfd.c20
-rw-r--r--fs/eventpoll.c66
-rw-r--r--fs/exec.c56
-rw-r--r--fs/exofs/inode.c16
-rw-r--r--fs/exportfs/expfs.c19
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext3/dir.c6
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext3/super.c3
-rw-r--r--fs/ext4/Kconfig15
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c6
-rw-r--r--fs/ext4/dir.c47
-rw-r--r--fs/ext4/ext4.h167
-rw-r--r--fs/ext4/ext4_extents.h40
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c480
-rw-r--r--fs/ext4/extents_status.c500
-rw-r--r--fs/ext4/extents_status.h45
-rw-r--r--fs/ext4/file.c338
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/indirect.c5
-rw-r--r--fs/ext4/inline.c1884
-rw-r--r--fs/ext4/inode.c629
-rw-r--r--fs/ext4/mballoc.c60
-rw-r--r--fs/ext4/migrate.c1
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c531
-rw-r--r--fs/ext4/page-io.c3
-rw-r--r--fs/ext4/resize.c17
-rw-r--r--fs/ext4/super.c57
-rw-r--r--fs/ext4/symlink.c4
-rw-r--r--fs/ext4/xattr.c110
-rw-r--r--fs/ext4/xattr.h158
-rw-r--r--fs/fat/fat.h3
-rw-r--r--fs/fat/inode.c55
-rw-r--r--fs/fat/misc.c9
-rw-r--r--fs/fhandle.c4
-rw-r--r--fs/file.c21
-rw-r--r--fs/fs-writeback.c4
-rw-r--r--fs/fs_struct.c24
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c20
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/fuse/fuse_i.h4
-rw-r--r--fs/fuse/inode.c23
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c54
-rw-r--r--fs/gfs2/dir.c7
-rw-r--r--fs/gfs2/file.c28
-rw-r--r--fs/gfs2/glock.c42
-rw-r--r--fs/gfs2/glock.h54
-rw-r--r--fs/gfs2/glops.c19
-rw-r--r--fs/gfs2/incore.h6
-rw-r--r--fs/gfs2/inode.c209
-rw-r--r--fs/gfs2/lock_dlm.c20
-rw-r--r--fs/gfs2/lops.c16
-rw-r--r--fs/gfs2/ops_fstype.c3
-rw-r--r--fs/gfs2/quota.c17
-rw-r--r--fs/gfs2/rgrp.c172
-rw-r--r--fs/gfs2/rgrp.h3
-rw-r--r--fs/gfs2/super.c3
-rw-r--r--fs/gfs2/trace_gfs2.h2
-rw-r--r--fs/gfs2/trans.c8
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c111
-rw-r--r--fs/inode.c18
-rw-r--r--fs/internal.h1
-rw-r--r--fs/jbd/transaction.c4
-rw-r--r--fs/jbd2/journal.c1
-rw-r--r--fs/jbd2/transaction.c13
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/lockd/clnt4xdr.c8
-rw-r--r--fs/lockd/clntproc.c3
-rw-r--r--fs/lockd/clntxdr.c8
-rw-r--r--fs/lockd/host.c15
-rw-r--r--fs/lockd/mon.c3
-rw-r--r--fs/logfs/inode.c2
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c5
-rw-r--r--fs/namespace.c212
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c17
-rw-r--r--fs/nfs/callback_xdr.c5
-rw-r--r--fs/nfs/client.c9
-rw-r--r--fs/nfs/dir.c35
-rw-r--r--fs/nfs/direct.c17
-rw-r--r--fs/nfs/file.c10
-rw-r--r--fs/nfs/idmap.c12
-rw-r--r--fs/nfs/inode.c10
-rw-r--r--fs/nfs/internal.h42
-rw-r--r--fs/nfs/mount_clnt.c7
-rw-r--r--fs/nfs/nfs2xdr.c4
-rw-r--r--fs/nfs/nfs3proc.c6
-rw-r--r--fs/nfs/nfs3xdr.c7
-rw-r--r--fs/nfs/nfs4_fs.h29
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c45
-rw-r--r--fs/nfs/nfs4filelayoutdev.c3
-rw-r--r--fs/nfs/nfs4proc.c820
-rw-r--r--fs/nfs/nfs4session.c552
-rw-r--r--fs/nfs/nfs4session.h142
-rw-r--r--fs/nfs/nfs4state.c143
-rw-r--r--fs/nfs/nfs4super.c1
-rw-r--r--fs/nfs/nfs4xdr.c52
-rw-r--r--fs/nfs/objlayout/objlayout.c11
-rw-r--r--fs/nfs/pnfs.c17
-rw-r--r--fs/nfs/proc.c43
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/fanotify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c5
-rw-r--r--fs/notify/fdinfo.c179
-rw-r--r--fs/notify/fdinfo.h27
-rw-r--r--fs/notify/inode_mark.c5
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/notification.c2
-rw-r--r--fs/ocfs2/extent_map.c12
-rw-r--r--fs/ocfs2/file.c11
-rw-r--r--fs/open.c2
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c27
-rw-r--r--fs/proc/base.c293
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/generic.c26
-rw-r--r--fs/proc/inode.c6
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/namespaces.c185
-rw-r--r--fs/proc/proc_devtree.c6
-rw-r--r--fs/proc/proc_sysctl.c9
-rw-r--r--fs/proc/root.c17
-rw-r--r--fs/proc/self.c59
-rw-r--r--fs/proc/task_mmu.c59
-rw-r--r--fs/pstore/ftrace.c4
-rw-r--r--fs/pstore/inode.c13
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c14
-rw-r--r--fs/pstore/ram.c51
-rw-r--r--fs/quota/quota.c4
-rw-r--r--fs/read_write.c40
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c60
-rw-r--r--fs/seq_file.c4
-rw-r--r--fs/signalfd.c18
-rw-r--r--fs/splice.c5
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysfs/mount.c1
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/lprops.c6
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/udf/inode.c14
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/uuid.h6
-rw-r--r--fs/xfs/xfs_ag.h5
-rw-r--r--fs/xfs/xfs_alloc.c183
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c79
-rw-r--r--fs/xfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/xfs_aops.c137
-rw-r--r--fs/xfs/xfs_attr.c103
-rw-r--r--fs/xfs/xfs_attr_leaf.c163
-rw-r--r--fs/xfs/xfs_attr_leaf.h6
-rw-r--r--fs/xfs/xfs_bmap.c127
-rw-r--r--fs/xfs/xfs_bmap.h9
-rw-r--r--fs/xfs/xfs_bmap_btree.c63
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c111
-rw-r--r--fs/xfs/xfs_btree.h22
-rw-r--r--fs/xfs/xfs_buf.c73
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_buf_item.c18
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c141
-rw-r--r--fs/xfs/xfs_da_btree.h10
-rw-r--r--fs/xfs/xfs_dfrag.c13
-rw-r--r--fs/xfs/xfs_dir2_block.c436
-rw-r--r--fs/xfs/xfs_dir2_data.c170
-rw-r--r--fs/xfs/xfs_dir2_leaf.c172
-rw-r--r--fs/xfs/xfs_dir2_node.c288
-rw-r--r--fs/xfs/xfs_dir2_priv.h19
-rw-r--r--fs/xfs/xfs_dquot.c134
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_export.c1
-rw-r--r--fs/xfs/xfs_file.c42
-rw-r--r--fs/xfs/xfs_fs.h33
-rw-r--r--fs/xfs/xfs_fs_subr.c96
-rw-r--r--fs/xfs/xfs_fsops.c158
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_ialloc.c84
-rw-r--r--fs/xfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/xfs_ialloc_btree.c55
-rw-r--r--fs/xfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)914
-rw-r--r--fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)28
-rw-r--r--fs/xfs/xfs_iget.c705
-rw-r--r--fs/xfs/xfs_inode.c440
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_ioctl.c23
-rw-r--r--fs/xfs/xfs_iomap.c35
-rw-r--r--fs/xfs/xfs_iops.c8
-rw-r--r--fs/xfs/xfs_itable.c4
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c260
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h12
-rw-r--r--fs/xfs/xfs_log_recover.c148
-rw-r--r--fs/xfs/xfs_mount.c163
-rw-r--r--fs/xfs/xfs_mount.h13
-rw-r--r--fs/xfs/xfs_qm.c22
-rw-r--r--fs/xfs/xfs_qm_syscalls.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_sb.h7
-rw-r--r--fs/xfs/xfs_super.c148
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h60
-rw-r--r--fs/xfs/xfs_trans.h19
-rw-r--r--fs/xfs/xfs_trans_buf.c9
-rw-r--r--fs/xfs/xfs_vnodeops.c168
-rw-r--r--fs/xfs/xfs_vnodeops.h9
332 files changed, 17315 insertions, 8424 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f95ae3a027f3..eaff24a19502 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,8 +28,8 @@ config FS_MBCACHE
28 tristate 28 tristate
29 default y if EXT2_FS=y && EXT2_FS_XATTR 29 default y if EXT2_FS=y && EXT2_FS_XATTR
30 default y if EXT3_FS=y && EXT3_FS_XATTR 30 default y if EXT3_FS=y && EXT3_FS_XATTR
31 default y if EXT4_FS=y && EXT4_FS_XATTR 31 default y if EXT4_FS=y
32 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR 32 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
33 33
34source "fs/reiserfs/Kconfig" 34source "fs/reiserfs/Kconfig"
35source "fs/jfs/Kconfig" 35source "fs/jfs/Kconfig"
diff --git a/fs/attr.c b/fs/attr.c
index cce7df53b694..1449adb14ef6 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
49 /* Make sure a caller can chown. */ 49 /* Make sure a caller can chown. */
50 if ((ia_valid & ATTR_UID) && 50 if ((ia_valid & ATTR_UID) &&
51 (!uid_eq(current_fsuid(), inode->i_uid) || 51 (!uid_eq(current_fsuid(), inode->i_uid) ||
52 !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) 52 !uid_eq(attr->ia_uid, inode->i_uid)) &&
53 !inode_capable(inode, CAP_CHOWN))
53 return -EPERM; 54 return -EPERM;
54 55
55 /* Make sure caller can chgrp. */ 56 /* Make sure caller can chgrp. */
56 if ((ia_valid & ATTR_GID) && 57 if ((ia_valid & ATTR_GID) &&
57 (!uid_eq(current_fsuid(), inode->i_uid) || 58 (!uid_eq(current_fsuid(), inode->i_uid) ||
58 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && 59 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
59 !capable(CAP_CHOWN)) 60 !inode_capable(inode, CAP_CHOWN))
60 return -EPERM; 61 return -EPERM;
61 62
62 /* Make sure a caller can chmod. */ 63 /* Make sure a caller can chmod. */
@@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
65 return -EPERM; 66 return -EPERM;
66 /* Also check the setgid bit! */ 67 /* Also check the setgid bit! */
67 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 68 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
68 inode->i_gid) && !capable(CAP_FSETID)) 69 inode->i_gid) &&
70 !inode_capable(inode, CAP_FSETID))
69 attr->ia_mode &= ~S_ISGID; 71 attr->ia_mode &= ~S_ISGID;
70 } 72 }
71 73
@@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
157 if (ia_valid & ATTR_MODE) { 159 if (ia_valid & ATTR_MODE) {
158 umode_t mode = attr->ia_mode; 160 umode_t mode = attr->ia_mode;
159 161
160 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 162 if (!in_group_p(inode->i_gid) &&
163 !inode_capable(inode, CAP_FSETID))
161 mode &= ~S_ISGID; 164 mode &= ~S_ISGID;
162 inode->i_mode = mode; 165 inode->i_mode = mode;
163 } 166 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 908e18455413..b785e7707959 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -74,8 +74,8 @@ struct autofs_info {
74 unsigned long last_used; 74 unsigned long last_used;
75 atomic_t count; 75 atomic_t count;
76 76
77 uid_t uid; 77 kuid_t uid;
78 gid_t gid; 78 kgid_t gid;
79}; 79};
80 80
81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
@@ -89,8 +89,8 @@ struct autofs_wait_queue {
89 struct qstr name; 89 struct qstr name;
90 u32 dev; 90 u32 dev;
91 u64 ino; 91 u64 ino;
92 uid_t uid; 92 kuid_t uid;
93 gid_t gid; 93 kgid_t gid;
94 pid_t pid; 94 pid_t pid;
95 pid_t tgid; 95 pid_t tgid;
96 /* This is for status reporting upon return */ 96 /* This is for status reporting upon return */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index a16214109d31..9f68a37bb2b2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
437 err = 0; 437 err = 0;
438 autofs4_expire_wait(path.dentry); 438 autofs4_expire_wait(path.dentry);
439 spin_lock(&sbi->fs_lock); 439 spin_lock(&sbi->fs_lock);
440 param->requester.uid = ino->uid; 440 param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
441 param->requester.gid = ino->gid; 441 param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
442 spin_unlock(&sbi->fs_lock); 442 spin_unlock(&sbi->fs_lock);
443 } 443 }
444 path_put(&path); 444 path_put(&path);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 842d00048a65..01443ce43ee7 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -548,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
548 548
549 spin_lock(&sbi->fs_lock); 549 spin_lock(&sbi->fs_lock);
550 ino->flags &= ~AUTOFS_INF_EXPIRING; 550 ino->flags &= ~AUTOFS_INF_EXPIRING;
551 spin_lock(&dentry->d_lock);
552 if (!ret) {
553 if ((IS_ROOT(dentry) ||
554 (autofs_type_indirect(sbi->type) &&
555 IS_ROOT(dentry->d_parent))) &&
556 !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
557 __managed_dentry_set_automount(dentry);
558 }
559 spin_unlock(&dentry->d_lock);
560 complete_all(&ino->expire_complete); 551 complete_all(&ino->expire_complete);
561 spin_unlock(&sbi->fs_lock); 552 spin_unlock(&sbi->fs_lock);
562 dput(dentry); 553 dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8a4fed8ead30..b104726e2d0a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
36 36
37void autofs4_clean_ino(struct autofs_info *ino) 37void autofs4_clean_ino(struct autofs_info *ino)
38{ 38{
39 ino->uid = 0; 39 ino->uid = GLOBAL_ROOT_UID;
40 ino->gid = 0; 40 ino->gid = GLOBAL_ROOT_GID;
41 ino->last_used = jiffies; 41 ino->last_used = jiffies;
42} 42}
43 43
@@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
79 return 0; 79 return 0;
80 80
81 seq_printf(m, ",fd=%d", sbi->pipefd); 81 seq_printf(m, ",fd=%d", sbi->pipefd);
82 if (root_inode->i_uid != 0) 82 if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
83 seq_printf(m, ",uid=%u", root_inode->i_uid); 83 seq_printf(m, ",uid=%u",
84 if (root_inode->i_gid != 0) 84 from_kuid_munged(&init_user_ns, root_inode->i_uid));
85 seq_printf(m, ",gid=%u", root_inode->i_gid); 85 if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
86 seq_printf(m, ",gid=%u",
87 from_kgid_munged(&init_user_ns, root_inode->i_gid));
86 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); 88 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
87 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); 89 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
88 seq_printf(m, ",minproto=%d", sbi->min_proto); 90 seq_printf(m, ",minproto=%d", sbi->min_proto);
@@ -126,7 +128,7 @@ static const match_table_t tokens = {
126 {Opt_err, NULL} 128 {Opt_err, NULL}
127}; 129};
128 130
129static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, 131static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
130 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) 132 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
131{ 133{
132 char *p; 134 char *p;
@@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
159 case Opt_uid: 161 case Opt_uid:
160 if (match_int(args, &option)) 162 if (match_int(args, &option))
161 return 1; 163 return 1;
162 *uid = option; 164 *uid = make_kuid(current_user_ns(), option);
165 if (!uid_valid(*uid))
166 return 1;
163 break; 167 break;
164 case Opt_gid: 168 case Opt_gid:
165 if (match_int(args, &option)) 169 if (match_int(args, &option))
166 return 1; 170 return 1;
167 *gid = option; 171 *gid = make_kgid(current_user_ns(), option);
172 if (!gid_valid(*gid))
173 return 1;
168 break; 174 break;
169 case Opt_pgrp: 175 case Opt_pgrp:
170 if (match_int(args, &option)) 176 if (match_int(args, &option))
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 91b11650722e..c93447604da8 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
124 * it. 124 * it.
125 */ 125 */
126 spin_lock(&sbi->lookup_lock); 126 spin_lock(&sbi->lookup_lock);
127 spin_lock(&dentry->d_lock); 127 if (!d_mountpoint(dentry) && simple_empty(dentry)) {
128 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
129 spin_unlock(&dentry->d_lock);
130 spin_unlock(&sbi->lookup_lock); 128 spin_unlock(&sbi->lookup_lock);
131 return -ENOENT; 129 return -ENOENT;
132 } 130 }
133 spin_unlock(&dentry->d_lock);
134 spin_unlock(&sbi->lookup_lock); 131 spin_unlock(&sbi->lookup_lock);
135 132
136out: 133out:
@@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
355 status = autofs4_mount_wait(dentry); 352 status = autofs4_mount_wait(dentry);
356 if (status) 353 if (status)
357 return ERR_PTR(status); 354 return ERR_PTR(status);
358 spin_lock(&sbi->fs_lock);
359 goto done; 355 goto done;
360 } 356 }
361 357
@@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
364 * having d_mountpoint() true, so there's no need to call back 360 * having d_mountpoint() true, so there's no need to call back
365 * to the daemon. 361 * to the daemon.
366 */ 362 */
367 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) 363 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
364 spin_unlock(&sbi->fs_lock);
368 goto done; 365 goto done;
366 }
367
369 if (!d_mountpoint(dentry)) { 368 if (!d_mountpoint(dentry)) {
370 /* 369 /*
371 * It's possible that user space hasn't removed directories 370 * It's possible that user space hasn't removed directories
@@ -379,15 +378,13 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
379 * require user space behave. 378 * require user space behave.
380 */ 379 */
381 if (sbi->version > 4) { 380 if (sbi->version > 4) {
382 if (have_submounts(dentry)) 381 if (have_submounts(dentry)) {
382 spin_unlock(&sbi->fs_lock);
383 goto done; 383 goto done;
384 }
384 } else { 385 } else {
385 spin_lock(&dentry->d_lock); 386 if (!simple_empty(dentry))
386 if (!list_empty(&dentry->d_subdirs)) {
387 spin_unlock(&dentry->d_lock);
388 goto done; 387 goto done;
389 }
390 spin_unlock(&dentry->d_lock);
391 } 388 }
392 ino->flags |= AUTOFS_INF_PENDING; 389 ino->flags |= AUTOFS_INF_PENDING;
393 spin_unlock(&sbi->fs_lock); 390 spin_unlock(&sbi->fs_lock);
@@ -399,28 +396,8 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
399 return ERR_PTR(status); 396 return ERR_PTR(status);
400 } 397 }
401 } 398 }
402done:
403 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
404 /*
405 * Any needed mounting has been completed and the path
406 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
407 * call ->d_automount() on rootless multi-mounts since
408 * it can lead to an incorrect ELOOP error return.
409 *
410 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
411 * symlinks as in all other cases the dentry will be covered by
412 * an actual mount so ->d_automount() won't be called during
413 * the follow.
414 */
415 spin_lock(&dentry->d_lock);
416 if ((!d_mountpoint(dentry) &&
417 !list_empty(&dentry->d_subdirs)) ||
418 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
419 __managed_dentry_clear_automount(dentry);
420 spin_unlock(&dentry->d_lock);
421 }
422 spin_unlock(&sbi->fs_lock); 399 spin_unlock(&sbi->fs_lock);
423 400done:
424 /* Mount succeeded, check if we ended up with a new dentry */ 401 /* Mount succeeded, check if we ended up with a new dentry */
425 dentry = autofs4_mountpoint_changed(path); 402 dentry = autofs4_mountpoint_changed(path);
426 if (!dentry) 403 if (!dentry)
@@ -432,6 +409,8 @@ done:
432int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) 409int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
433{ 410{
434 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 411 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
412 struct autofs_info *ino = autofs4_dentry_ino(dentry);
413 int status;
435 414
436 DPRINTK("dentry=%p %.*s", 415 DPRINTK("dentry=%p %.*s",
437 dentry, dentry->d_name.len, dentry->d_name.name); 416 dentry, dentry->d_name.len, dentry->d_name.name);
@@ -456,7 +435,32 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
456 * This dentry may be under construction so wait on mount 435 * This dentry may be under construction so wait on mount
457 * completion. 436 * completion.
458 */ 437 */
459 return autofs4_mount_wait(dentry); 438 status = autofs4_mount_wait(dentry);
439 if (status)
440 return status;
441
442 spin_lock(&sbi->fs_lock);
443 /*
444 * If the dentry has been selected for expire while we slept
445 * on the lock then it might go away. We'll deal with that in
446 * ->d_automount() and wait on a new mount if the expire
447 * succeeds or return here if it doesn't (since there's no
448 * mount to follow with a rootless multi-mount).
449 */
450 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
451 /*
452 * Any needed mounting has been completed and the path
453 * updated so check if this is a rootless multi-mount so
454 * we can avoid needless calls ->d_automount() and avoid
455 * an incorrect ELOOP error return.
456 */
457 if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
458 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
459 status = -EISDIR;
460 }
461 spin_unlock(&sbi->fs_lock);
462
463 return status;
460} 464}
461 465
462/* Lookups in the root directory */ 466/* Lookups in the root directory */
@@ -599,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
599 603
600 spin_lock(&sbi->lookup_lock); 604 spin_lock(&sbi->lookup_lock);
601 __autofs4_add_expiring(dentry); 605 __autofs4_add_expiring(dentry);
602 spin_lock(&dentry->d_lock); 606 d_drop(dentry);
603 __d_drop(dentry);
604 spin_unlock(&dentry->d_lock);
605 spin_unlock(&sbi->lookup_lock); 607 spin_unlock(&sbi->lookup_lock);
606 608
607 return 0; 609 return 0;
@@ -672,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
672 return -EACCES; 674 return -EACCES;
673 675
674 spin_lock(&sbi->lookup_lock); 676 spin_lock(&sbi->lookup_lock);
675 spin_lock(&dentry->d_lock); 677 if (!simple_empty(dentry)) {
676 if (!list_empty(&dentry->d_subdirs)) {
677 spin_unlock(&dentry->d_lock);
678 spin_unlock(&sbi->lookup_lock); 678 spin_unlock(&sbi->lookup_lock);
679 return -ENOTEMPTY; 679 return -ENOTEMPTY;
680 } 680 }
681 __autofs4_add_expiring(dentry); 681 __autofs4_add_expiring(dentry);
682 __d_drop(dentry); 682 d_drop(dentry);
683 spin_unlock(&dentry->d_lock);
684 spin_unlock(&sbi->lookup_lock); 683 spin_unlock(&sbi->lookup_lock);
685 684
686 if (sbi->version < 5) 685 if (sbi->version < 5)
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index dce436e595c1..03bc1d347d8e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
154 case autofs_ptype_expire_direct: 154 case autofs_ptype_expire_direct:
155 { 155 {
156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; 156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
157 struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
157 158
158 pktsz = sizeof(*packet); 159 pktsz = sizeof(*packet);
159 160
@@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 packet->name[wq->name.len] = '\0'; 164 packet->name[wq->name.len] = '\0';
164 packet->dev = wq->dev; 165 packet->dev = wq->dev;
165 packet->ino = wq->ino; 166 packet->ino = wq->ino;
166 packet->uid = wq->uid; 167 packet->uid = from_kuid_munged(user_ns, wq->uid);
167 packet->gid = wq->gid; 168 packet->gid = from_kgid_munged(user_ns, wq->gid);
168 packet->pid = wq->pid; 169 packet->pid = wq->pid;
169 packet->tgid = wq->tgid; 170 packet->tgid = wq->tgid;
170 break; 171 break;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index b1342ffb3cf6..922ad460bff9 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -16,7 +16,7 @@
16#include <linux/poll.h> 16#include <linux/poll.h>
17 17
18 18
19static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) 19static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
20{ 20{
21 return -EIO; 21 return -EIO;
22} 22}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0e7a6f81ae36..6043567b95c2 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,7 +30,7 @@
30#include <asm/cacheflush.h> 30#include <asm/cacheflush.h>
31#include <asm/a.out-core.h> 31#include <asm/a.out-core.h>
32 32
33static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); 33static int load_aout_binary(struct linux_binprm *);
34static int load_aout_library(struct file*); 34static int load_aout_library(struct file*);
35 35
36#ifdef CONFIG_COREDUMP 36#ifdef CONFIG_COREDUMP
@@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
201 * libraries. There is no binary dependent code anywhere else. 201 * libraries. There is no binary dependent code anywhere else.
202 */ 202 */
203 203
204static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) 204static int load_aout_binary(struct linux_binprm * bprm)
205{ 205{
206 struct pt_regs *regs = current_pt_regs();
206 struct exec ex; 207 struct exec ex;
207 unsigned long error; 208 unsigned long error;
208 unsigned long fd_offset; 209 unsigned long fd_offset;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbd9f60bd763..0c42cdbabecf 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@
44#define user_siginfo_t siginfo_t 44#define user_siginfo_t siginfo_t
45#endif 45#endif
46 46
47static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); 47static int load_elf_binary(struct linux_binprm *bprm);
48static int load_elf_library(struct file *); 48static int load_elf_library(struct file *);
49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, 49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
50 int, int, unsigned long); 50 int, int, unsigned long);
@@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
558#endif 558#endif
559} 559}
560 560
561static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) 561static int load_elf_binary(struct linux_binprm *bprm)
562{ 562{
563 struct file *interpreter = NULL; /* to shut gcc up */ 563 struct file *interpreter = NULL; /* to shut gcc up */
564 unsigned long load_addr = 0, load_bias = 0; 564 unsigned long load_addr = 0, load_bias = 0;
@@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
575 unsigned long reloc_func_desc __maybe_unused = 0; 575 unsigned long reloc_func_desc __maybe_unused = 0;
576 int executable_stack = EXSTACK_DEFAULT; 576 int executable_stack = EXSTACK_DEFAULT;
577 unsigned long def_flags = 0; 577 unsigned long def_flags = 0;
578 struct pt_regs *regs = current_pt_regs();
578 struct { 579 struct {
579 struct elfhdr elf_ex; 580 struct elfhdr elf_ex;
580 struct elfhdr interp_elf_ex; 581 struct elfhdr interp_elf_ex;
@@ -1600,8 +1601,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1600 info->thread = NULL; 1601 info->thread = NULL;
1601 1602
1602 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); 1603 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1603 if (psinfo == NULL) 1604 if (psinfo == NULL) {
1605 info->psinfo.data = NULL; /* So we don't free this wrongly */
1604 return 0; 1606 return 0;
1607 }
1605 1608
1606 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); 1609 fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1607 1610
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a46049154107..dc84732e554f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
56 56
57MODULE_LICENSE("GPL"); 57MODULE_LICENSE("GPL");
58 58
59static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); 59static int load_elf_fdpic_binary(struct linux_binprm *);
60static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); 60static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
61static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, 61static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
62 struct mm_struct *, const char *); 62 struct mm_struct *, const char *);
@@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
164/* 164/*
165 * load an fdpic binary into various bits of memory 165 * load an fdpic binary into various bits of memory
166 */ 166 */
167static int load_elf_fdpic_binary(struct linux_binprm *bprm, 167static int load_elf_fdpic_binary(struct linux_binprm *bprm)
168 struct pt_regs *regs)
169{ 168{
170 struct elf_fdpic_params exec_params, interp_params; 169 struct elf_fdpic_params exec_params, interp_params;
170 struct pt_regs *regs = current_pt_regs();
171 struct elf_phdr *phdr; 171 struct elf_phdr *phdr;
172 unsigned long stack_size, entryaddr; 172 unsigned long stack_size, entryaddr;
173#ifdef ELF_FDPIC_PLAT_INIT 173#ifdef ELF_FDPIC_PLAT_INIT
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912e..037a3e2b045b 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
22#define EM86_INTERP "/usr/bin/em86" 22#define EM86_INTERP "/usr/bin/em86"
23#define EM86_I_NAME "em86" 23#define EM86_I_NAME "em86"
24 24
25static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) 25static int load_em86(struct linux_binprm *bprm)
26{ 26{
27 char *interp, *i_name, *i_arg; 27 char *interp, *i_name, *i_arg;
28 struct file * file; 28 struct file * file;
@@ -42,7 +42,6 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
42 return -ENOEXEC; 42 return -ENOEXEC;
43 } 43 }
44 44
45 bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
46 allow_write_access(bprm->file); 45 allow_write_access(bprm->file);
47 fput(bprm->file); 46 fput(bprm->file);
48 bprm->file = NULL; 47 bprm->file = NULL;
@@ -90,7 +89,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
90 if (retval < 0) 89 if (retval < 0)
91 return retval; 90 return retval;
92 91
93 return search_binary_handler(bprm, regs); 92 return search_binary_handler(bprm);
94} 93}
95 94
96static struct linux_binfmt em86_format = { 95static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e280352b28f9..b56371981d16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
88static int load_flat_shared_library(int id, struct lib_info *p); 88static int load_flat_shared_library(int id, struct lib_info *p);
89#endif 89#endif
90 90
91static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); 91static int load_flat_binary(struct linux_binprm *);
92static int flat_core_dump(struct coredump_params *cprm); 92static int flat_core_dump(struct coredump_params *cprm);
93 93
94static struct linux_binfmt flat_format = { 94static struct linux_binfmt flat_format = {
@@ -858,9 +858,10 @@ out:
858 * libraries. There is no binary dependent code anywhere else. 858 * libraries. There is no binary dependent code anywhere else.
859 */ 859 */
860 860
861static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) 861static int load_flat_binary(struct linux_binprm * bprm)
862{ 862{
863 struct lib_info libinfo; 863 struct lib_info libinfo;
864 struct pt_regs *regs = current_pt_regs();
864 unsigned long p = bprm->p; 865 unsigned long p = bprm->p;
865 unsigned long stack_len; 866 unsigned long stack_len;
866 unsigned long start_addr; 867 unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca67..9be335fb8a7c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
104/* 104/*
105 * the loader itself 105 * the loader itself
106 */ 106 */
107static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) 107static int load_misc_binary(struct linux_binprm *bprm)
108{ 108{
109 Node *fmt; 109 Node *fmt;
110 struct file * interp_file = NULL; 110 struct file * interp_file = NULL;
@@ -117,10 +117,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
117 if (!enabled) 117 if (!enabled)
118 goto _ret; 118 goto _ret;
119 119
120 retval = -ENOEXEC;
121 if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
122 goto _ret;
123
124 /* to keep locking time low, we copy the interpreter string */ 120 /* to keep locking time low, we copy the interpreter string */
125 read_lock(&entries_lock); 121 read_lock(&entries_lock);
126 fmt = check_file(bprm); 122 fmt = check_file(bprm);
@@ -197,9 +193,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
197 if (retval < 0) 193 if (retval < 0)
198 goto _error; 194 goto _error;
199 195
200 bprm->recursion_depth++; 196 retval = search_binary_handler(bprm);
201
202 retval = search_binary_handler (bprm, regs);
203 if (retval < 0) 197 if (retval < 0)
204 goto _error; 198 goto _error;
205 199
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f63155..1610a91637e5 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16 16
17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) 17static int load_script(struct linux_binprm *bprm)
18{ 18{
19 const char *i_arg, *i_name; 19 const char *i_arg, *i_name;
20 char *cp; 20 char *cp;
@@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
22 char interp[BINPRM_BUF_SIZE]; 22 char interp[BINPRM_BUF_SIZE];
23 int retval; 23 int retval;
24 24
25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || 25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
26 (bprm->recursion_depth > BINPRM_MAX_RECURSION))
27 return -ENOEXEC; 26 return -ENOEXEC;
28 /* 27 /*
29 * This section does the #! interpretation. 28 * This section does the #! interpretation.
30 * Sorta complicated, but hopefully it will work. -TYT 29 * Sorta complicated, but hopefully it will work. -TYT
31 */ 30 */
32 31
33 bprm->recursion_depth++;
34 allow_write_access(bprm->file); 32 allow_write_access(bprm->file);
35 fput(bprm->file); 33 fput(bprm->file);
36 bprm->file = NULL; 34 bprm->file = NULL;
@@ -95,7 +93,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
95 retval = prepare_binprm(bprm); 93 retval = prepare_binprm(bprm);
96 if (retval < 0) 94 if (retval < 0)
97 return retval; 95 return retval;
98 return search_binary_handler(bprm,regs); 96 return search_binary_handler(bprm);
99} 97}
100 98
101static struct linux_binfmt script_format = { 99static struct linux_binfmt script_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b4..4e00ed68d4a6 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
35 35
36#include <linux/elf.h> 36#include <linux/elf.h>
37 37
38static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); 38static int load_som_binary(struct linux_binprm * bprm);
39static int load_som_library(struct file *); 39static int load_som_library(struct file *);
40 40
41/* 41/*
@@ -180,13 +180,14 @@ out:
180 */ 180 */
181 181
182static int 182static int
183load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) 183load_som_binary(struct linux_binprm * bprm)
184{ 184{
185 int retval; 185 int retval;
186 unsigned int size; 186 unsigned int size;
187 unsigned long som_entry; 187 unsigned long som_entry;
188 struct som_hdr *som_ex; 188 struct som_hdr *som_ex;
189 struct som_exec_auxhdr *hpuxhdr; 189 struct som_exec_auxhdr *hpuxhdr;
190 struct pt_regs *regs = current_pt_regs();
190 191
191 /* Get the exec-header */ 192 /* Get the exec-header */
192 som_ex = (struct som_hdr *) bprm->buf; 193 som_ex = (struct som_hdr *) bprm->buf;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a1e5e3b1eaf..172f8491a2bd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
70 spin_unlock(&dst->wb.list_lock); 70 spin_unlock(&dst->wb.list_lock);
71} 71}
72 72
73sector_t blkdev_max_block(struct block_device *bdev)
74{
75 sector_t retval = ~((sector_t)0);
76 loff_t sz = i_size_read(bdev->bd_inode);
77
78 if (sz) {
79 unsigned int size = block_size(bdev);
80 unsigned int sizebits = blksize_bits(size);
81 retval = (sz >> sizebits);
82 }
83 return retval;
84}
85
86/* Kill _all_ buffers and pagecache , dirty or not.. */ 73/* Kill _all_ buffers and pagecache , dirty or not.. */
87void kill_bdev(struct block_device *bdev) 74void kill_bdev(struct block_device *bdev)
88{ 75{
@@ -116,8 +103,6 @@ EXPORT_SYMBOL(invalidate_bdev);
116 103
117int set_blocksize(struct block_device *bdev, int size) 104int set_blocksize(struct block_device *bdev, int size)
118{ 105{
119 struct address_space *mapping;
120
121 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 106 /* Size must be a power of two, and between 512 and PAGE_SIZE */
122 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 107 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
123 return -EINVAL; 108 return -EINVAL;
@@ -126,19 +111,6 @@ int set_blocksize(struct block_device *bdev, int size)
126 if (size < bdev_logical_block_size(bdev)) 111 if (size < bdev_logical_block_size(bdev))
127 return -EINVAL; 112 return -EINVAL;
128 113
129 /* Prevent starting I/O or mapping the device */
130 percpu_down_write(&bdev->bd_block_size_semaphore);
131
132 /* Check that the block device is not memory mapped */
133 mapping = bdev->bd_inode->i_mapping;
134 mutex_lock(&mapping->i_mmap_mutex);
135 if (mapping_mapped(mapping)) {
136 mutex_unlock(&mapping->i_mmap_mutex);
137 percpu_up_write(&bdev->bd_block_size_semaphore);
138 return -EBUSY;
139 }
140 mutex_unlock(&mapping->i_mmap_mutex);
141
142 /* Don't change the size if it is same as current */ 114 /* Don't change the size if it is same as current */
143 if (bdev->bd_block_size != size) { 115 if (bdev->bd_block_size != size) {
144 sync_blockdev(bdev); 116 sync_blockdev(bdev);
@@ -146,9 +118,6 @@ int set_blocksize(struct block_device *bdev, int size)
146 bdev->bd_inode->i_blkbits = blksize_bits(size); 118 bdev->bd_inode->i_blkbits = blksize_bits(size);
147 kill_bdev(bdev); 119 kill_bdev(bdev);
148 } 120 }
149
150 percpu_up_write(&bdev->bd_block_size_semaphore);
151
152 return 0; 121 return 0;
153} 122}
154 123
@@ -181,52 +150,12 @@ static int
181blkdev_get_block(struct inode *inode, sector_t iblock, 150blkdev_get_block(struct inode *inode, sector_t iblock,
182 struct buffer_head *bh, int create) 151 struct buffer_head *bh, int create)
183{ 152{
184 if (iblock >= blkdev_max_block(I_BDEV(inode))) {
185 if (create)
186 return -EIO;
187
188 /*
189 * for reads, we're just trying to fill a partial page.
190 * return a hole, they will have to call get_block again
191 * before they can fill it, and they will get -EIO at that
192 * time
193 */
194 return 0;
195 }
196 bh->b_bdev = I_BDEV(inode); 153 bh->b_bdev = I_BDEV(inode);
197 bh->b_blocknr = iblock; 154 bh->b_blocknr = iblock;
198 set_buffer_mapped(bh); 155 set_buffer_mapped(bh);
199 return 0; 156 return 0;
200} 157}
201 158
202static int
203blkdev_get_blocks(struct inode *inode, sector_t iblock,
204 struct buffer_head *bh, int create)
205{
206 sector_t end_block = blkdev_max_block(I_BDEV(inode));
207 unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
208
209 if ((iblock + max_blocks) > end_block) {
210 max_blocks = end_block - iblock;
211 if ((long)max_blocks <= 0) {
212 if (create)
213 return -EIO; /* write fully beyond EOF */
214 /*
215 * It is a read which is fully beyond EOF. We return
216 * a !buffer_mapped buffer
217 */
218 max_blocks = 0;
219 }
220 }
221
222 bh->b_bdev = I_BDEV(inode);
223 bh->b_blocknr = iblock;
224 bh->b_size = max_blocks << inode->i_blkbits;
225 if (max_blocks)
226 set_buffer_mapped(bh);
227 return 0;
228}
229
230static ssize_t 159static ssize_t
231blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 160blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
232 loff_t offset, unsigned long nr_segs) 161 loff_t offset, unsigned long nr_segs)
@@ -235,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
235 struct inode *inode = file->f_mapping->host; 164 struct inode *inode = file->f_mapping->host;
236 165
237 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, 166 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
238 nr_segs, blkdev_get_blocks, NULL, NULL, 0); 167 nr_segs, blkdev_get_block, NULL, NULL, 0);
239} 168}
240 169
241int __sync_blockdev(struct block_device *bdev, int wait) 170int __sync_blockdev(struct block_device *bdev, int wait)
@@ -392,7 +321,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
392 * for a block special file file->f_path.dentry->d_inode->i_size is zero 321 * for a block special file file->f_path.dentry->d_inode->i_size is zero
393 * so we compute the size by hand (just as in block_read/write above) 322 * so we compute the size by hand (just as in block_read/write above)
394 */ 323 */
395static loff_t block_llseek(struct file *file, loff_t offset, int origin) 324static loff_t block_llseek(struct file *file, loff_t offset, int whence)
396{ 325{
397 struct inode *bd_inode = file->f_mapping->host; 326 struct inode *bd_inode = file->f_mapping->host;
398 loff_t size; 327 loff_t size;
@@ -402,7 +331,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
402 size = i_size_read(bd_inode); 331 size = i_size_read(bd_inode);
403 332
404 retval = -EINVAL; 333 retval = -EINVAL;
405 switch (origin) { 334 switch (whence) {
406 case SEEK_END: 335 case SEEK_END:
407 offset += size; 336 offset += size;
408 break; 337 break;
@@ -459,12 +388,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
459 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 388 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
460 if (!ei) 389 if (!ei)
461 return NULL; 390 return NULL;
462
463 if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
464 kmem_cache_free(bdev_cachep, ei);
465 return NULL;
466 }
467
468 return &ei->vfs_inode; 391 return &ei->vfs_inode;
469} 392}
470 393
@@ -473,8 +396,6 @@ static void bdev_i_callback(struct rcu_head *head)
473 struct inode *inode = container_of(head, struct inode, i_rcu); 396 struct inode *inode = container_of(head, struct inode, i_rcu);
474 struct bdev_inode *bdi = BDEV_I(inode); 397 struct bdev_inode *bdi = BDEV_I(inode);
475 398
476 percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
477
478 kmem_cache_free(bdev_cachep, bdi); 399 kmem_cache_free(bdev_cachep, bdi);
479} 400}
480 401
@@ -1593,22 +1514,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1593 return blkdev_ioctl(bdev, mode, cmd, arg); 1514 return blkdev_ioctl(bdev, mode, cmd, arg);
1594} 1515}
1595 1516
1596ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1597 unsigned long nr_segs, loff_t pos)
1598{
1599 ssize_t ret;
1600 struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
1601
1602 percpu_down_read(&bdev->bd_block_size_semaphore);
1603
1604 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
1605
1606 percpu_up_read(&bdev->bd_block_size_semaphore);
1607
1608 return ret;
1609}
1610EXPORT_SYMBOL_GPL(blkdev_aio_read);
1611
1612/* 1517/*
1613 * Write data to the block device. Only intended for the block device itself 1518 * Write data to the block device. Only intended for the block device itself
1614 * and the raw driver which basically is a fake block device. 1519 * and the raw driver which basically is a fake block device.
@@ -1620,16 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1620 unsigned long nr_segs, loff_t pos) 1525 unsigned long nr_segs, loff_t pos)
1621{ 1526{
1622 struct file *file = iocb->ki_filp; 1527 struct file *file = iocb->ki_filp;
1623 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1624 struct blk_plug plug; 1528 struct blk_plug plug;
1625 ssize_t ret; 1529 ssize_t ret;
1626 1530
1627 BUG_ON(iocb->ki_pos != pos); 1531 BUG_ON(iocb->ki_pos != pos);
1628 1532
1629 blk_start_plug(&plug); 1533 blk_start_plug(&plug);
1630
1631 percpu_down_read(&bdev->bd_block_size_semaphore);
1632
1633 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1534 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1634 if (ret > 0 || ret == -EIOCBQUEUED) { 1535 if (ret > 0 || ret == -EIOCBQUEUED) {
1635 ssize_t err; 1536 ssize_t err;
@@ -1638,62 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1638 if (err < 0 && ret > 0) 1539 if (err < 0 && ret > 0)
1639 ret = err; 1540 ret = err;
1640 } 1541 }
1641
1642 percpu_up_read(&bdev->bd_block_size_semaphore);
1643
1644 blk_finish_plug(&plug); 1542 blk_finish_plug(&plug);
1645
1646 return ret; 1543 return ret;
1647} 1544}
1648EXPORT_SYMBOL_GPL(blkdev_aio_write); 1545EXPORT_SYMBOL_GPL(blkdev_aio_write);
1649 1546
1650static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) 1547static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1651{ 1548 unsigned long nr_segs, loff_t pos)
1652 int ret;
1653 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1654
1655 percpu_down_read(&bdev->bd_block_size_semaphore);
1656
1657 ret = generic_file_mmap(file, vma);
1658
1659 percpu_up_read(&bdev->bd_block_size_semaphore);
1660
1661 return ret;
1662}
1663
1664static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
1665 struct pipe_inode_info *pipe, size_t len,
1666 unsigned int flags)
1667{
1668 ssize_t ret;
1669 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1670
1671 percpu_down_read(&bdev->bd_block_size_semaphore);
1672
1673 ret = generic_file_splice_read(file, ppos, pipe, len, flags);
1674
1675 percpu_up_read(&bdev->bd_block_size_semaphore);
1676
1677 return ret;
1678}
1679
1680static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
1681 struct file *file, loff_t *ppos, size_t len,
1682 unsigned int flags)
1683{ 1549{
1684 ssize_t ret; 1550 struct file *file = iocb->ki_filp;
1685 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1551 struct inode *bd_inode = file->f_mapping->host;
1686 1552 loff_t size = i_size_read(bd_inode);
1687 percpu_down_read(&bdev->bd_block_size_semaphore);
1688
1689 ret = generic_file_splice_write(pipe, file, ppos, len, flags);
1690 1553
1691 percpu_up_read(&bdev->bd_block_size_semaphore); 1554 if (pos >= size)
1555 return 0;
1692 1556
1693 return ret; 1557 size -= pos;
1558 if (size < INT_MAX)
1559 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
1560 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1694} 1561}
1695 1562
1696
1697/* 1563/*
1698 * Try to release a page associated with block device when the system 1564 * Try to release a page associated with block device when the system
1699 * is under memory pressure. 1565 * is under memory pressure.
@@ -1724,16 +1590,16 @@ const struct file_operations def_blk_fops = {
1724 .llseek = block_llseek, 1590 .llseek = block_llseek,
1725 .read = do_sync_read, 1591 .read = do_sync_read,
1726 .write = do_sync_write, 1592 .write = do_sync_write,
1727 .aio_read = blkdev_aio_read, 1593 .aio_read = blkdev_aio_read,
1728 .aio_write = blkdev_aio_write, 1594 .aio_write = blkdev_aio_write,
1729 .mmap = blkdev_mmap, 1595 .mmap = generic_file_mmap,
1730 .fsync = blkdev_fsync, 1596 .fsync = blkdev_fsync,
1731 .unlocked_ioctl = block_ioctl, 1597 .unlocked_ioctl = block_ioctl,
1732#ifdef CONFIG_COMPAT 1598#ifdef CONFIG_COMPAT
1733 .compat_ioctl = compat_blkdev_ioctl, 1599 .compat_ioctl = compat_blkdev_ioctl,
1734#endif 1600#endif
1735 .splice_read = blkdev_splice_read, 1601 .splice_read = generic_file_splice_read,
1736 .splice_write = blkdev_splice_write, 1602 .splice_write = generic_file_splice_write,
1737}; 1603};
1738 1604
1739int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1605int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
121 ret = posix_acl_equiv_mode(acl, &inode->i_mode); 121 ret = posix_acl_equiv_mode(acl, &inode->i_mode);
122 if (ret < 0) 122 if (ret < 0)
123 return ret; 123 return ret;
124 if (ret == 0)
125 acl = NULL;
124 } 126 }
125 ret = 0; 127 ret = 0;
126 break; 128 break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
461 pos2 = n2, n2 = pos2->next) { 461 pos2 = n2, n2 = pos2->next) {
462 struct __prelim_ref *ref2; 462 struct __prelim_ref *ref2;
463 struct __prelim_ref *xchg; 463 struct __prelim_ref *xchg;
464 struct extent_inode_elem *eie;
464 465
465 ref2 = list_entry(pos2, struct __prelim_ref, list); 466 ref2 = list_entry(pos2, struct __prelim_ref, list);
466 467
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
472 ref1 = ref2; 473 ref1 = ref2;
473 ref2 = xchg; 474 ref2 = xchg;
474 } 475 }
475 ref1->count += ref2->count;
476 } else { 476 } else {
477 if (ref1->parent != ref2->parent) 477 if (ref1->parent != ref2->parent)
478 continue; 478 continue;
479 ref1->count += ref2->count;
480 } 479 }
480
481 eie = ref1->inode_list;
482 while (eie && eie->next)
483 eie = eie->next;
484 if (eie)
485 eie->next = ref2->inode_list;
486 else
487 ref1->inode_list = ref2->inode_list;
488 ref1->count += ref2->count;
489
481 list_del(&ref2->list); 490 list_del(&ref2->list);
482 kfree(ref2); 491 kfree(ref2);
483 } 492 }
@@ -890,8 +899,7 @@ again:
890 while (!list_empty(&prefs)) { 899 while (!list_empty(&prefs)) {
891 ref = list_first_entry(&prefs, struct __prelim_ref, list); 900 ref = list_first_entry(&prefs, struct __prelim_ref, list);
892 list_del(&ref->list); 901 list_del(&ref->list);
893 if (ref->count < 0) 902 WARN_ON(ref->count < 0);
894 WARN_ON(1);
895 if (ref->count && ref->root_id && ref->parent == 0) { 903 if (ref->count && ref->root_id && ref->parent == 0) {
896 /* no parent == root of tree */ 904 /* no parent == root of tree */
897 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); 905 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 40#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
41#define BTRFS_INODE_NEEDS_FULL_SYNC 7 41#define BTRFS_INODE_NEEDS_FULL_SYNC 7
42#define BTRFS_INODE_COPY_EVERYTHING 8
42 43
43/* in memory btrfs inode */ 44/* in memory btrfs inode */
44struct btrfs_inode { 45struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
90 91
91 unsigned long runtime_flags; 92 unsigned long runtime_flags;
92 93
94 /* Keep track of who's O_SYNC/fsycing currently */
95 atomic_t sync_writers;
96
93 /* full 64 bit generation number, struct vfs_inode doesn't have a big 97 /* full 64 bit generation number, struct vfs_inode doesn't have a big
94 * enough field for this. 98 * enough field for this.
95 */ 99 */
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
137 unsigned int never_written:1; /* block was added because it was 137 unsigned int never_written:1; /* block was added because it was
138 * referenced, not because it was 138 * referenced, not because it was
139 * written */ 139 * written */
140 unsigned int mirror_num:2; /* large enough to hold 140 unsigned int mirror_num; /* large enough to hold
141 * BTRFS_SUPER_MIRROR_MAX */ 141 * BTRFS_SUPER_MIRROR_MAX */
142 struct btrfsic_dev_state *dev_state; 142 struct btrfsic_dev_state *dev_state;
143 u64 dev_bytenr; /* key, physical byte num on disk */ 143 u64 dev_bytenr; /* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
723 } 723 }
724 724
725 num_copies = 725 num_copies =
726 btrfs_num_copies(&state->root->fs_info->mapping_tree, 726 btrfs_num_copies(state->root->fs_info,
727 next_bytenr, state->metablock_size); 727 next_bytenr, state->metablock_size);
728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 728 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 729 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
903 } 903 }
904 904
905 num_copies = 905 num_copies =
906 btrfs_num_copies(&state->root->fs_info->mapping_tree, 906 btrfs_num_copies(state->root->fs_info,
907 next_bytenr, state->metablock_size); 907 next_bytenr, state->metablock_size);
908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 908 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 909 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
1287 *next_blockp = NULL; 1287 *next_blockp = NULL;
1288 if (0 == *num_copiesp) { 1288 if (0 == *num_copiesp) {
1289 *num_copiesp = 1289 *num_copiesp =
1290 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1290 btrfs_num_copies(state->root->fs_info,
1291 next_bytenr, state->metablock_size); 1291 next_bytenr, state->metablock_size);
1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1292 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1293 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
1489 chunk_len = num_bytes; 1489 chunk_len = num_bytes;
1490 1490
1491 num_copies = 1491 num_copies =
1492 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1492 btrfs_num_copies(state->root->fs_info,
1493 next_bytenr, state->datablock_size); 1493 next_bytenr, state->datablock_size);
1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1494 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1495 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1582 struct btrfs_device *device; 1582 struct btrfs_device *device;
1583 1583
1584 length = len; 1584 length = len;
1585 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, 1585 ret = btrfs_map_block(state->root->fs_info, READ,
1586 bytenr, &length, &multi, mirror_num); 1586 bytenr, &length, &multi, mirror_num);
1587 1587
1588 if (ret) {
1589 block_ctx_out->start = 0;
1590 block_ctx_out->dev_bytenr = 0;
1591 block_ctx_out->len = 0;
1592 block_ctx_out->dev = NULL;
1593 block_ctx_out->datav = NULL;
1594 block_ctx_out->pagev = NULL;
1595 block_ctx_out->mem_to_free = NULL;
1596
1597 return ret;
1598 }
1599
1588 device = multi->stripes[0].dev; 1600 device = multi->stripes[0].dev;
1589 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); 1601 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1590 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1602 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1594 block_ctx_out->pagev = NULL; 1606 block_ctx_out->pagev = NULL;
1595 block_ctx_out->mem_to_free = NULL; 1607 block_ctx_out->mem_to_free = NULL;
1596 1608
1597 if (0 == ret) 1609 kfree(multi);
1598 kfree(multi);
1599 if (NULL == block_ctx_out->dev) { 1610 if (NULL == block_ctx_out->dev) {
1600 ret = -ENXIO; 1611 ret = -ENXIO;
1601 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); 1612 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
2463 } 2474 }
2464 2475
2465 num_copies = 2476 num_copies =
2466 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2477 btrfs_num_copies(state->root->fs_info,
2467 next_bytenr, BTRFS_SUPER_INFO_SIZE); 2478 next_bytenr, BTRFS_SUPER_INFO_SIZE);
2468 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2479 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2469 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2480 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2960 struct btrfsic_block_data_ctx block_ctx; 2971 struct btrfsic_block_data_ctx block_ctx;
2961 int match = 0; 2972 int match = 0;
2962 2973
2963 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2974 num_copies = btrfs_num_copies(state->root->fs_info,
2964 bytenr, state->metablock_size); 2975 bytenr, state->metablock_size);
2965 2976
2966 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
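[editor's note] Every check-integrity call site above switches btrfs_num_copies() from taking &fs_info->mapping_tree to taking fs_info itself; presumably (an assumption, the diff only shows the prototype change) so the helper can also consult state beyond the chunk map, such as the dev-replace target introduced later in this commit. The calling pattern is unchanged, as in the mirror loop at the end of this hunk; condensed:

    int num_copies, mirror_num;

    num_copies = btrfs_num_copies(fs_info, bytenr, len);
    for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
            /* read or compare one mirror; bytenr and len are hypothetical inputs */
    }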
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
687 687
688 ret = btrfs_map_bio(root, READ, comp_bio, 688 ret = btrfs_map_bio(root, READ, comp_bio,
689 mirror_num, 0); 689 mirror_num, 0);
690 BUG_ON(ret); /* -ENOMEM */ 690 if (ret)
691 bio_endio(comp_bio, ret);
691 692
692 bio_put(comp_bio); 693 bio_put(comp_bio);
693 694
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
712 } 713 }
713 714
714 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 715 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
715 BUG_ON(ret); /* -ENOMEM */ 716 if (ret)
717 bio_endio(comp_bio, ret);
716 718
717 bio_put(comp_bio); 719 bio_put(comp_bio);
718 return 0; 720 return 0;
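[editor's note] Both compressed-read submission paths now propagate a btrfs_map_bio() failure through bio_endio() instead of crashing on BUG_ON(ret), so the error reaches the normal end_io completion path. The pattern, taken directly from the hunks above:

    ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
    if (ret)
            bio_endio(comp_bio, ret);   /* fail the bio instead of BUG_ON(ret) */

    bio_put(comp_bio);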
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..c7b67cf24bba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 struct btrfs_path *path, int level, int slot, 41 struct btrfs_path *path, int level, int slot);
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb); 43 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 44struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
776 775
777static noinline void 776static noinline void
778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 777tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779 struct extent_buffer *eb, 778 struct extent_buffer *eb, int slot, int atomic)
780 struct btrfs_disk_key *disk_key, int slot, int atomic)
781{ 779{
782 int ret; 780 int ret;
783 781
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1140 switch (tm->op) { 1138 switch (tm->op) {
1141 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1142 BUG_ON(tm->slot < n); 1140 BUG_ON(tm->slot < n);
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1144 case MOD_LOG_KEY_REMOVE: 1141 case MOD_LOG_KEY_REMOVE:
1142 n++;
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1145 btrfs_set_node_key(eb, &tm->key, tm->slot); 1144 btrfs_set_node_key(eb, &tm->key, tm->slot);
1146 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 1145 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1147 btrfs_set_node_ptr_generation(eb, tm->slot, 1146 btrfs_set_node_ptr_generation(eb, tm->slot,
1148 tm->generation); 1147 tm->generation);
1149 n++;
1150 break; 1148 break;
1151 case MOD_LOG_KEY_REPLACE: 1149 case MOD_LOG_KEY_REPLACE:
1152 BUG_ON(tm->slot >= n); 1150 BUG_ON(tm->slot >= n);
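[editor's note] The reordered cases change when n, the rewound item count, is bumped: a plain KEY_REMOVE must grow the node by one before the key is put back, REMOVE_WHILE_MOVING restores the key without changing n, and REMOVE_WHILE_FREEING additionally asserts the slot lies beyond the current count. The fallthrough lets all three share the restore code. A reduced sketch of the new control flow (field restores elided into a comment):

    switch (tm->op) {
    case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
            BUG_ON(tm->slot < n);
            /* fall through */
    case MOD_LOG_KEY_REMOVE:
            n++;                    /* the removed slot exists again */
            /* fall through */
    case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
            /* restore key, blockptr and generation at tm->slot */
            break;
    }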
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1361 u64 search_start; 1359 u64 search_start;
1362 int ret; 1360 int ret;
1363 1361
1364 if (trans->transaction != root->fs_info->running_transaction) { 1362 if (trans->transaction != root->fs_info->running_transaction)
1365 printk(KERN_CRIT "trans %llu running %llu\n", 1363 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1366 (unsigned long long)trans->transid, 1364 (unsigned long long)trans->transid,
1367 (unsigned long long) 1365 (unsigned long long)
1368 root->fs_info->running_transaction->transid); 1366 root->fs_info->running_transaction->transid);
1369 WARN_ON(1); 1367
1370 } 1368 if (trans->transid != root->fs_info->generation)
1371 if (trans->transid != root->fs_info->generation) { 1369 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1372 printk(KERN_CRIT "trans %llu running %llu\n",
1373 (unsigned long long)trans->transid, 1370 (unsigned long long)trans->transid,
1374 (unsigned long long)root->fs_info->generation); 1371 (unsigned long long)root->fs_info->generation);
1375 WARN_ON(1);
1376 }
1377 1372
1378 if (!should_cow_block(trans, root, buf)) { 1373 if (!should_cow_block(trans, root, buf)) {
1379 *cow_ret = buf; 1374 *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1469 if (cache_only && parent_level != 1) 1464 if (cache_only && parent_level != 1)
1470 return 0; 1465 return 0;
1471 1466
1472 if (trans->transaction != root->fs_info->running_transaction) 1467 WARN_ON(trans->transaction != root->fs_info->running_transaction);
1473 WARN_ON(1); 1468 WARN_ON(trans->transid != root->fs_info->generation);
1474 if (trans->transid != root->fs_info->generation)
1475 WARN_ON(1);
1476 1469
1477 parent_nritems = btrfs_header_nritems(parent); 1470 parent_nritems = btrfs_header_nritems(parent);
1478 blocksize = btrfs_level_size(root, parent_level - 1); 1471 blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1827 if (btrfs_header_nritems(right) == 0) { 1820 if (btrfs_header_nritems(right) == 0) {
1828 clean_tree_block(trans, root, right); 1821 clean_tree_block(trans, root, right);
1829 btrfs_tree_unlock(right); 1822 btrfs_tree_unlock(right);
1830 del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1823 del_ptr(trans, root, path, level + 1, pslot + 1);
1831 root_sub_used(root, right->len); 1824 root_sub_used(root, right->len);
1832 btrfs_free_tree_block(trans, root, right, 0, 1); 1825 btrfs_free_tree_block(trans, root, right, 0, 1);
1833 free_extent_buffer_stale(right); 1826 free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1836 struct btrfs_disk_key right_key; 1829 struct btrfs_disk_key right_key;
1837 btrfs_node_key(right, &right_key, 0); 1830 btrfs_node_key(right, &right_key, 0);
1838 tree_mod_log_set_node_key(root->fs_info, parent, 1831 tree_mod_log_set_node_key(root->fs_info, parent,
1839 &right_key, pslot + 1, 0); 1832 pslot + 1, 0);
1840 btrfs_set_node_key(parent, &right_key, pslot + 1); 1833 btrfs_set_node_key(parent, &right_key, pslot + 1);
1841 btrfs_mark_buffer_dirty(parent); 1834 btrfs_mark_buffer_dirty(parent);
1842 } 1835 }
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1871 if (btrfs_header_nritems(mid) == 0) { 1864 if (btrfs_header_nritems(mid) == 0) {
1872 clean_tree_block(trans, root, mid); 1865 clean_tree_block(trans, root, mid);
1873 btrfs_tree_unlock(mid); 1866 btrfs_tree_unlock(mid);
1874 del_ptr(trans, root, path, level + 1, pslot, 1); 1867 del_ptr(trans, root, path, level + 1, pslot);
1875 root_sub_used(root, mid->len); 1868 root_sub_used(root, mid->len);
1876 btrfs_free_tree_block(trans, root, mid, 0, 1); 1869 btrfs_free_tree_block(trans, root, mid, 0, 1);
1877 free_extent_buffer_stale(mid); 1870 free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1880 /* update the parent key to reflect our changes */ 1873 /* update the parent key to reflect our changes */
1881 struct btrfs_disk_key mid_key; 1874 struct btrfs_disk_key mid_key;
1882 btrfs_node_key(mid, &mid_key, 0); 1875 btrfs_node_key(mid, &mid_key, 0);
1883 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1876 tree_mod_log_set_node_key(root->fs_info, parent,
1884 pslot, 0); 1877 pslot, 0);
1885 btrfs_set_node_key(parent, &mid_key, pslot); 1878 btrfs_set_node_key(parent, &mid_key, pslot);
1886 btrfs_mark_buffer_dirty(parent); 1879 btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1980 orig_slot += left_nr; 1973 orig_slot += left_nr;
1981 btrfs_node_key(mid, &disk_key, 0); 1974 btrfs_node_key(mid, &disk_key, 0);
1982 tree_mod_log_set_node_key(root->fs_info, parent, 1975 tree_mod_log_set_node_key(root->fs_info, parent,
1983 &disk_key, pslot, 0); 1976 pslot, 0);
1984 btrfs_set_node_key(parent, &disk_key, pslot); 1977 btrfs_set_node_key(parent, &disk_key, pslot);
1985 btrfs_mark_buffer_dirty(parent); 1978 btrfs_mark_buffer_dirty(parent);
1986 if (btrfs_header_nritems(left) > orig_slot) { 1979 if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2033 2026
2034 btrfs_node_key(right, &disk_key, 0); 2027 btrfs_node_key(right, &disk_key, 0);
2035 tree_mod_log_set_node_key(root->fs_info, parent, 2028 tree_mod_log_set_node_key(root->fs_info, parent,
2036 &disk_key, pslot + 1, 0); 2029 pslot + 1, 0);
2037 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2030 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2038 btrfs_mark_buffer_dirty(parent); 2031 btrfs_mark_buffer_dirty(parent);
2039 2032
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
2219 int no_skips = 0; 2212 int no_skips = 0;
2220 struct extent_buffer *t; 2213 struct extent_buffer *t;
2221 2214
2215 if (path->really_keep_locks)
2216 return;
2217
2222 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2218 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2223 if (!path->nodes[i]) 2219 if (!path->nodes[i])
2224 break; 2220 break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2266{ 2262{
2267 int i; 2263 int i;
2268 2264
2269 if (path->keep_locks) 2265 if (path->keep_locks || path->really_keep_locks)
2270 return; 2266 return;
2271 2267
2272 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2268 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2499 if (!cow) 2495 if (!cow)
2500 write_lock_level = -1; 2496 write_lock_level = -1;
2501 2497
2502 if (cow && (p->keep_locks || p->lowest_level)) 2498 if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
2503 write_lock_level = BTRFS_MAX_LEVEL; 2499 write_lock_level = BTRFS_MAX_LEVEL;
2504 2500
2505 min_write_lock_level = write_lock_level; 2501 min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
2568 * must have write locks on this node and the 2564 * must have write locks on this node and the
2569 * parent 2565 * parent
2570 */ 2566 */
2571 if (level + 1 > write_lock_level) { 2567 if (level > write_lock_level ||
2568 (level + 1 > write_lock_level &&
2569 level + 1 < BTRFS_MAX_LEVEL &&
2570 p->nodes[level + 1])) {
2572 write_lock_level = level + 1; 2571 write_lock_level = level + 1;
2573 btrfs_release_path(p); 2572 btrfs_release_path(p);
2574 goto again; 2573 goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
2917 if (!path->nodes[i]) 2916 if (!path->nodes[i])
2918 break; 2917 break;
2919 t = path->nodes[i]; 2918 t = path->nodes[i];
2920 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2919 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2921 btrfs_set_node_key(t, key, tslot); 2920 btrfs_set_node_key(t, key, tslot);
2922 btrfs_mark_buffer_dirty(path->nodes[i]); 2921 btrfs_mark_buffer_dirty(path->nodes[i]);
2923 if (tslot != 0) 2922 if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3302 */ 3301 */
3303static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3302static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3304{ 3303{
3304 struct btrfs_item *start_item;
3305 struct btrfs_item *end_item;
3306 struct btrfs_map_token token;
3305 int data_len; 3307 int data_len;
3306 int nritems = btrfs_header_nritems(l); 3308 int nritems = btrfs_header_nritems(l);
3307 int end = min(nritems, start + nr) - 1; 3309 int end = min(nritems, start + nr) - 1;
3308 3310
3309 if (!nr) 3311 if (!nr)
3310 return 0; 3312 return 0;
3311 data_len = btrfs_item_end_nr(l, start); 3313 btrfs_init_map_token(&token);
3312 data_len = data_len - btrfs_item_offset_nr(l, end); 3314 start_item = btrfs_item_nr(l, start);
3315 end_item = btrfs_item_nr(l, end);
3316 data_len = btrfs_token_item_offset(l, start_item, &token) +
3317 btrfs_token_item_size(l, start_item, &token);
3318 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3313 data_len += sizeof(struct btrfs_item) * nr; 3319 data_len += sizeof(struct btrfs_item) * nr;
3314 WARN_ON(data_len < 0); 3320 WARN_ON(data_len < 0);
3315 return data_len; 3321 return data_len;
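[editor's note] leaf_space_used() now reads the item fields through a btrfs_map_token, which caches the kmapped address of the extent-buffer page across the three reads instead of remapping for each accessor call. Arithmetically the result is the same as before: end of the first item minus offset of the last, i.e. offset(start) + size(start) - offset(end). A sketch of the token pattern:

    struct btrfs_map_token token;
    u32 off, size;

    btrfs_init_map_token(&token);                    /* token.kaddr starts out NULL */
    off = btrfs_token_item_offset(l, item, &token);  /* maps the page and caches it */
    size = btrfs_token_item_size(l, item, &token);   /* reuses the cached mapping */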
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3403 if (push_items == 0) 3409 if (push_items == 0)
3404 goto out_unlock; 3410 goto out_unlock;
3405 3411
3406 if (!empty && push_items == left_nritems) 3412 WARN_ON(!empty && push_items == left_nritems);
3407 WARN_ON(1);
3408 3413
3409 /* push left to right */ 3414 /* push left to right */
3410 right_nritems = btrfs_header_nritems(right); 3415 right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3642 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3647 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3643 3648
3644 /* fixup right node */ 3649 /* fixup right node */
3645 if (push_items > right_nritems) { 3650 if (push_items > right_nritems)
3646 printk(KERN_CRIT "push items %d nr %u\n", push_items, 3651 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3647 right_nritems); 3652 right_nritems);
3648 WARN_ON(1);
3649 }
3650 3653
3651 if (push_items < right_nritems) { 3654 if (push_items < right_nritems) {
3652 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3655 push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4602 * empty a node. 4605 * empty a node.
4603 */ 4606 */
4604static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4607static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4605 struct btrfs_path *path, int level, int slot, 4608 struct btrfs_path *path, int level, int slot)
4606 int tree_mod_log)
4607{ 4609{
4608 struct extent_buffer *parent = path->nodes[level]; 4610 struct extent_buffer *parent = path->nodes[level];
4609 u32 nritems; 4611 u32 nritems;
4610 int ret; 4612 int ret;
4611 4613
4614 if (level) {
4615 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4616 MOD_LOG_KEY_REMOVE);
4617 BUG_ON(ret < 0);
4618 }
4619
4612 nritems = btrfs_header_nritems(parent); 4620 nritems = btrfs_header_nritems(parent);
4613 if (slot != nritems - 1) { 4621 if (slot != nritems - 1) {
4614 if (tree_mod_log && level) 4622 if (level)
4615 tree_mod_log_eb_move(root->fs_info, parent, slot, 4623 tree_mod_log_eb_move(root->fs_info, parent, slot,
4616 slot + 1, nritems - slot - 1); 4624 slot + 1, nritems - slot - 1);
4617 memmove_extent_buffer(parent, 4625 memmove_extent_buffer(parent,
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4619 btrfs_node_key_ptr_offset(slot + 1), 4627 btrfs_node_key_ptr_offset(slot + 1),
4620 sizeof(struct btrfs_key_ptr) * 4628 sizeof(struct btrfs_key_ptr) *
4621 (nritems - slot - 1)); 4629 (nritems - slot - 1));
4622 } else if (tree_mod_log && level) {
4623 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4624 MOD_LOG_KEY_REMOVE);
4625 BUG_ON(ret < 0);
4626 } 4630 }
4627 4631
4628 nritems--; 4632 nritems--;
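[editor's note] del_ptr() loses its tree_mod_log parameter, and the KEY_REMOVE record is now written unconditionally for internal nodes. Before this change it was only written in the branch that skipped the memmove (deletion of the last slot), so removals from the middle of a node logged the eb_move but never the key removal itself. Condensed from the new function body:

    if (level) {
            ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
                                          MOD_LOG_KEY_REMOVE);
            BUG_ON(ret < 0);
    }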
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4656 struct extent_buffer *leaf) 4660 struct extent_buffer *leaf)
4657{ 4661{
4658 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4662 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4659 del_ptr(trans, root, path, 1, path->slots[1], 1); 4663 del_ptr(trans, root, path, 1, path->slots[1]);
4660 4664
4661 /* 4665 /*
4662 * btrfs_free_extent is expensive, we want to make sure we 4666 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5123 right_path->search_commit_root = 1; 5127 right_path->search_commit_root = 1;
5124 right_path->skip_locking = 1; 5128 right_path->skip_locking = 1;
5125 5129
5126 spin_lock(&left_root->root_times_lock); 5130 spin_lock(&left_root->root_item_lock);
5127 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5131 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5128 spin_unlock(&left_root->root_times_lock); 5132 spin_unlock(&left_root->root_item_lock);
5129 5133
5130 spin_lock(&right_root->root_times_lock); 5134 spin_lock(&right_root->root_item_lock);
5131 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5135 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5132 spin_unlock(&right_root->root_times_lock); 5136 spin_unlock(&right_root->root_item_lock);
5133 5137
5134 trans = btrfs_join_transaction(left_root); 5138 trans = btrfs_join_transaction(left_root);
5135 if (IS_ERR(trans)) { 5139 if (IS_ERR(trans)) {
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5224 goto out; 5228 goto out;
5225 } 5229 }
5226 5230
5227 spin_lock(&left_root->root_times_lock); 5231 spin_lock(&left_root->root_item_lock);
5228 ctransid = btrfs_root_ctransid(&left_root->root_item); 5232 ctransid = btrfs_root_ctransid(&left_root->root_item);
5229 spin_unlock(&left_root->root_times_lock); 5233 spin_unlock(&left_root->root_item_lock);
5230 if (ctransid != left_start_ctransid) 5234 if (ctransid != left_start_ctransid)
5231 left_start_ctransid = 0; 5235 left_start_ctransid = 0;
5232 5236
5233 spin_lock(&right_root->root_times_lock); 5237 spin_lock(&right_root->root_item_lock);
5234 ctransid = btrfs_root_ctransid(&right_root->root_item); 5238 ctransid = btrfs_root_ctransid(&right_root->root_item);
5235 spin_unlock(&right_root->root_times_lock); 5239 spin_unlock(&right_root->root_item_lock);
5236 if (ctransid != right_start_ctransid) 5240 if (ctransid != right_start_ctransid)
5237 right_start_ctransid = 0; 5241 right_start_ctransid = 0;
5238 5242
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5496 return btrfs_next_old_leaf(root, path, 0); 5500 return btrfs_next_old_leaf(root, path, 0);
5497} 5501}
5498 5502
5503/* Release the path up to but not including the given level */
5504static void btrfs_release_level(struct btrfs_path *path, int level)
5505{
5506 int i;
5507
5508 for (i = 0; i < level; i++) {
5509 path->slots[i] = 0;
5510 if (!path->nodes[i])
5511 continue;
5512 if (path->locks[i]) {
5513 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
5514 path->locks[i] = 0;
5515 }
5516 free_extent_buffer(path->nodes[i]);
5517 path->nodes[i] = NULL;
5518 }
5519}
5520
5521/*
5522 * This function assumes 2 things
5523 *
5524 * 1) You are using path->keep_locks
5525 * 2) You are not inserting items.
5526 *
5531 * If either of these is not true, do not use this function. If you need a next
5532 * leaf with either of these not being true, this function can easily be
5529 * adapted to do that, but at the moment these are the limitations.
5530 */
5531int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
5532 struct btrfs_root *root, struct btrfs_path *path,
5533 int del)
5534{
5535 struct extent_buffer *b;
5536 struct btrfs_key key;
5537 u32 nritems;
5538 int level = 1;
5539 int slot;
5540 int ret = 1;
5541 int write_lock_level = BTRFS_MAX_LEVEL;
5542 int ins_len = del ? -1 : 0;
5543
5544 WARN_ON(!(path->keep_locks || path->really_keep_locks));
5545
5546 nritems = btrfs_header_nritems(path->nodes[0]);
5547 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5548
5549 while (path->nodes[level]) {
5550 nritems = btrfs_header_nritems(path->nodes[level]);
5551 if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
5552search:
5553 btrfs_release_path(path);
5554 ret = btrfs_search_slot(trans, root, &key, path,
5555 ins_len, 1);
5556 if (ret < 0)
5557 goto out;
5558 level = 1;
5559 continue;
5560 }
5561
5562 if (path->slots[level] >= nritems - 1) {
5563 level++;
5564 continue;
5565 }
5566
5567 btrfs_release_level(path, level);
5568 break;
5569 }
5570
5571 if (!path->nodes[level]) {
5572 ret = 1;
5573 goto out;
5574 }
5575
5576 path->slots[level]++;
5577 b = path->nodes[level];
5578
5579 while (b) {
5580 level = btrfs_header_level(b);
5581
5582 if (!should_cow_block(trans, root, b))
5583 goto cow_done;
5584
5585 btrfs_set_path_blocking(path);
5586 ret = btrfs_cow_block(trans, root, b,
5587 path->nodes[level + 1],
5588 path->slots[level + 1], &b);
5589 if (ret)
5590 goto out;
5591cow_done:
5592 path->nodes[level] = b;
5593 btrfs_clear_path_blocking(path, NULL, 0);
5594 if (level != 0) {
5595 ret = setup_nodes_for_search(trans, root, path, b,
5596 level, ins_len,
5597 &write_lock_level);
5598 if (ret == -EAGAIN)
5599 goto search;
5600 if (ret)
5601 goto out;
5602
5603 b = path->nodes[level];
5604 slot = path->slots[level];
5605
5606 ret = read_block_for_search(trans, root, path,
5607 &b, level, slot, &key, 0);
5608 if (ret == -EAGAIN)
5609 goto search;
5610 if (ret)
5611 goto out;
5612 level = btrfs_header_level(b);
5613 if (!btrfs_try_tree_write_lock(b)) {
5614 btrfs_set_path_blocking(path);
5615 btrfs_tree_lock(b);
5616 btrfs_clear_path_blocking(path, b,
5617 BTRFS_WRITE_LOCK);
5618 }
5619 path->locks[level] = BTRFS_WRITE_LOCK;
5620 path->nodes[level] = b;
5621 path->slots[level] = 0;
5622 } else {
5623 path->slots[level] = 0;
5624 ret = 0;
5625 break;
5626 }
5627 }
5628
5629out:
5630 if (ret)
5631 btrfs_release_path(path);
5632
5633 return ret;
5634}
5635
5499int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5636int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5500 u64 time_seq) 5637 u64 time_seq)
5501{ 5638{
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
48 48
49#define BTRFS_MAGIC "_BHRfS_M" 49#define BTRFS_MAGIC "_BHRfS_M"
50 50
51#define BTRFS_MAX_MIRRORS 2 51#define BTRFS_MAX_MIRRORS 3
52 52
53#define BTRFS_MAX_LEVEL 8 53#define BTRFS_MAX_LEVEL 8
54 54
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
142 142
143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
144 144
145#define BTRFS_DEV_REPLACE_DEVID 0
146
145/* 147/*
146 * the max metadata block size. This limit is somewhat artificial, 148 * the max metadata block size. This limit is somewhat artificial,
147 * but the memmove costs go through the roof for larger blocks. 149 * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
172/* four bytes for CRC32 */ 174/* four bytes for CRC32 */
173#define BTRFS_EMPTY_DIR_SIZE 0 175#define BTRFS_EMPTY_DIR_SIZE 0
174 176
 177/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
178#define REQ_GET_READ_MIRRORS (1 << 30)
179
175#define BTRFS_FT_UNKNOWN 0 180#define BTRFS_FT_UNKNOWN 0
176#define BTRFS_FT_REG_FILE 1 181#define BTRFS_FT_REG_FILE 1
177#define BTRFS_FT_DIR 2 182#define BTRFS_FT_DIR 2
@@ -413,7 +418,7 @@ struct btrfs_root_backup {
413 __le64 bytes_used; 418 __le64 bytes_used;
414 __le64 num_devices; 419 __le64 num_devices;
415 /* future */ 420 /* future */
416 __le64 unsed_64[4]; 421 __le64 unused_64[4];
417 422
418 u8 tree_root_level; 423 u8 tree_root_level;
419 u8 chunk_root_level; 424 u8 chunk_root_level;
@@ -571,6 +576,7 @@ struct btrfs_path {
571 unsigned int skip_locking:1; 576 unsigned int skip_locking:1;
572 unsigned int leave_spinning:1; 577 unsigned int leave_spinning:1;
573 unsigned int search_commit_root:1; 578 unsigned int search_commit_root:1;
579 unsigned int really_keep_locks:1;
574}; 580};
575 581
576/* 582/*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 891 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
886} __attribute__ ((__packed__)); 892} __attribute__ ((__packed__));
887 893
894#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
895#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
896#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
897#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
898#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
899#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
900#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
901
902struct btrfs_dev_replace {
903 u64 replace_state; /* see #define above */
904 u64 time_started; /* seconds since 1-Jan-1970 */
905 u64 time_stopped; /* seconds since 1-Jan-1970 */
906 atomic64_t num_write_errors;
907 atomic64_t num_uncorrectable_read_errors;
908
909 u64 cursor_left;
910 u64 committed_cursor_left;
911 u64 cursor_left_last_write_of_item;
912 u64 cursor_right;
913
914 u64 cont_reading_from_srcdev_mode; /* see #define above */
915
916 int is_valid;
917 int item_needs_writeback;
918 struct btrfs_device *srcdev;
919 struct btrfs_device *tgtdev;
920
921 pid_t lock_owner;
922 atomic_t nesting_level;
923 struct mutex lock_finishing_cancel_unmount;
924 struct mutex lock_management_lock;
925 struct mutex lock;
926
927 struct btrfs_scrub_progress scrub_progress;
928};
929
930struct btrfs_dev_replace_item {
931 /*
932 * grow this item struct at the end for future enhancements and keep
933 * the existing values unchanged
934 */
935 __le64 src_devid;
936 __le64 cursor_left;
937 __le64 cursor_right;
938 __le64 cont_reading_from_srcdev_mode;
939
940 __le64 replace_state;
941 __le64 time_started;
942 __le64 time_stopped;
943 __le64 num_write_errors;
944 __le64 num_uncorrectable_read_errors;
945} __attribute__ ((__packed__));
946
888/* different types of block groups (and chunks) */ 947/* different types of block groups (and chunks) */
889#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 948#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
890#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 949#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
1333 struct btrfs_workers generic_worker; 1392 struct btrfs_workers generic_worker;
1334 struct btrfs_workers workers; 1393 struct btrfs_workers workers;
1335 struct btrfs_workers delalloc_workers; 1394 struct btrfs_workers delalloc_workers;
1395 struct btrfs_workers flush_workers;
1336 struct btrfs_workers endio_workers; 1396 struct btrfs_workers endio_workers;
1337 struct btrfs_workers endio_meta_workers; 1397 struct btrfs_workers endio_meta_workers;
1338 struct btrfs_workers endio_meta_write_workers; 1398 struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
1429 struct rw_semaphore scrub_super_lock; 1489 struct rw_semaphore scrub_super_lock;
1430 int scrub_workers_refcnt; 1490 int scrub_workers_refcnt;
1431 struct btrfs_workers scrub_workers; 1491 struct btrfs_workers scrub_workers;
1492 struct btrfs_workers scrub_wr_completion_workers;
1493 struct btrfs_workers scrub_nocow_workers;
1432 1494
1433#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1495#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1434 u32 check_integrity_print_mask; 1496 u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
1470 int backup_root_index; 1532 int backup_root_index;
1471 1533
1472 int num_tolerated_disk_barrier_failures; 1534 int num_tolerated_disk_barrier_failures;
1535
1536 /* device replace state */
1537 struct btrfs_dev_replace dev_replace;
1538
1539 atomic_t mutually_exclusive_operation_running;
1473}; 1540};
1474 1541
1475/* 1542/*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
1579 1646
1580 int force_cow; 1647 int force_cow;
1581 1648
1582 spinlock_t root_times_lock; 1649 spinlock_t root_item_lock;
1583}; 1650};
1584 1651
1585struct btrfs_ioctl_defrag_range_args { 1652struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
1723#define BTRFS_DEV_STATS_KEY 249 1790#define BTRFS_DEV_STATS_KEY 249
1724 1791
1725/* 1792/*
 1793 * Persistently stores the device replace state in the device tree.
1794 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
1795 */
1796#define BTRFS_DEV_REPLACE_KEY 250
1797
1798/*
1726 * string items are for debugging. They just store a short string of 1799 * string items are for debugging. They just store a short string of
1727 * data in the FS 1800 * data in the FS
1728 */ 1801 */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
1787 1860
1788static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1861static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1789{ 1862{
1790 memset(token, 0, sizeof(*token)); 1863 token->kaddr = NULL;
1791} 1864}
1792 1865
1793/* some macros to generate set/get funcs for the struct fields. This 1866/* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2755BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, 2828BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2756 rsv_excl, 64); 2829 rsv_excl, 64);
2757 2830
2831/* btrfs_dev_replace_item */
2832BTRFS_SETGET_FUNCS(dev_replace_src_devid,
2833 struct btrfs_dev_replace_item, src_devid, 64);
2834BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
2835 struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
2836 64);
2837BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
2838 replace_state, 64);
2839BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
2840 time_started, 64);
2841BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
2842 time_stopped, 64);
2843BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
2844 num_write_errors, 64);
2845BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
2846 struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
2847 64);
2848BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
2849 cursor_left, 64);
2850BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
2851 cursor_right, 64);
2852
2853BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
2854 struct btrfs_dev_replace_item, src_devid, 64);
2855BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
2856 struct btrfs_dev_replace_item,
2857 cont_reading_from_srcdev_mode, 64);
2858BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
2859 struct btrfs_dev_replace_item, replace_state, 64);
2860BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
2861 struct btrfs_dev_replace_item, time_started, 64);
2862BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
2863 struct btrfs_dev_replace_item, time_stopped, 64);
2864BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
2865 struct btrfs_dev_replace_item, num_write_errors, 64);
2866BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
2867 struct btrfs_dev_replace_item,
2868 num_uncorrectable_read_errors, 64);
2869BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
2870 struct btrfs_dev_replace_item, cursor_left, 64);
2871BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
2872 struct btrfs_dev_replace_item, cursor_right, 64);
2873
2758static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2874static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2759{ 2875{
2760 return sb->s_fs_info; 2876 return sb->s_fs_info;
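[editor's note] The BTRFS_SETGET_FUNCS and BTRFS_SETGET_STACK_FUNCS invocations above generate endian-safe accessors for every field of btrfs_dev_replace_item: one set reads and writes through an extent_buffer, the other operates on an in-stack copy. The new dev-replace.c later in this diff uses them exactly like any other item accessor:

    ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
    dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
    btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right);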
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3016u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3017u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 3018void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
3019
3020enum btrfs_reserve_flush_enum {
3021 /* If we are in the transaction, we can't flush anything. */
3022 BTRFS_RESERVE_NO_FLUSH,
3023 /*
3024 * Flushing delalloc may cause a deadlock somewhere; in that
3025 * case, use FLUSH_LIMIT
3026 */
3027 BTRFS_RESERVE_FLUSH_LIMIT,
3028 BTRFS_RESERVE_FLUSH_ALL,
3029};
3030
2903int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3031int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2904void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3032void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2905void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3033void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2919void btrfs_free_block_rsv(struct btrfs_root *root, 3047void btrfs_free_block_rsv(struct btrfs_root *root,
2920 struct btrfs_block_rsv *rsv); 3048 struct btrfs_block_rsv *rsv);
2921int btrfs_block_rsv_add(struct btrfs_root *root, 3049int btrfs_block_rsv_add(struct btrfs_root *root,
2922 struct btrfs_block_rsv *block_rsv, 3050 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
2923 u64 num_bytes); 3051 enum btrfs_reserve_flush_enum flush);
2924int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2925 struct btrfs_block_rsv *block_rsv,
2926 u64 num_bytes);
2927int btrfs_block_rsv_check(struct btrfs_root *root, 3052int btrfs_block_rsv_check(struct btrfs_root *root,
2928 struct btrfs_block_rsv *block_rsv, int min_factor); 3053 struct btrfs_block_rsv *block_rsv, int min_factor);
2929int btrfs_block_rsv_refill(struct btrfs_root *root, 3054int btrfs_block_rsv_refill(struct btrfs_root *root,
2930 struct btrfs_block_rsv *block_rsv, 3055 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
2931 u64 min_reserved); 3056 enum btrfs_reserve_flush_enum flush);
2932int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2933 struct btrfs_block_rsv *block_rsv,
2934 u64 min_reserved);
2935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3057int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2936 struct btrfs_block_rsv *dst_rsv, 3058 struct btrfs_block_rsv *dst_rsv,
2937 u64 num_bytes); 3059 u64 num_bytes);
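[editor's note] The _noflush variants of btrfs_block_rsv_add() and btrfs_block_rsv_refill() are folded into the base functions, which now take a btrfs_reserve_flush_enum telling the reservation code how aggressively it may flush. Callers that used the noflush variants pass BTRFS_RESERVE_NO_FLUSH, as delayed-inode.c does later in this diff:

    /* old: ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); */
    ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
                              BTRFS_RESERVE_NO_FLUSH);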
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2955int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3077int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2956int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3078int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2957 struct btrfs_fs_info *fs_info); 3079 struct btrfs_fs_info *fs_info);
3080int __get_raid_index(u64 flags);
2958/* ctree.c */ 3081/* ctree.c */
2959int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2960 int level, int *slot); 3083 int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3065} 3188}
3066 3189
3067int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3190int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3191int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
3192 struct btrfs_root *root, struct btrfs_path *path,
3193 int del);
3068int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3194int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3069 u64 time_seq); 3195 u64 time_seq);
3070static inline int btrfs_next_old_item(struct btrfs_root *root, 3196static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3157 struct btrfs_root *root); 3283 struct btrfs_root *root);
3158 3284
3159/* dir-item.c */ 3285/* dir-item.c */
3286int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
3287 const char *name, int name_len);
3160int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3288int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
3161 struct btrfs_root *root, const char *name, 3289 struct btrfs_root *root, const char *name,
3162 int name_len, struct inode *dir, 3290 int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
3256 struct btrfs_root *root, 3384 struct btrfs_root *root,
3257 struct btrfs_path *path, u64 objectid, 3385 struct btrfs_path *path, u64 objectid,
3258 u64 bytenr, int mod); 3386 u64 bytenr, int mod);
3387u64 btrfs_file_extent_length(struct btrfs_path *path);
3259int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, 3389 struct btrfs_root *root,
3261 struct btrfs_ordered_sum *sums); 3390 struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
3271int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3400int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
3272 struct list_head *list, int search_commit); 3401 struct list_head *list, int search_commit);
3273/* inode.c */ 3402/* inode.c */
3403struct btrfs_delalloc_work {
3404 struct inode *inode;
3405 int wait;
3406 int delay_iput;
3407 struct completion completion;
3408 struct list_head list;
3409 struct btrfs_work work;
3410};
3411
3412struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
3413 int wait, int delay_iput);
3414void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3415
3274struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3416struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3275 size_t pg_offset, u64 start, u64 len, 3417 size_t pg_offset, u64 start, u64 len,
3276 int create); 3418 int create);
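[editor's note] btrfs_delalloc_work packages a per-inode flush request for the new flush_workers pool declared earlier in this header; the embedded completion lets a submitter queue several inodes and then wait on each in turn. A hypothetical submit-and-wait sketch; only the struct and the two helpers come from this diff, and the NULL-on-failure return is an assumption:

    struct btrfs_delalloc_work *work;

    work = btrfs_alloc_delalloc_work(inode, 1 /* wait */, 0 /* delay_iput */);
    if (!work)
            return -ENOMEM;    /* assumption: allocation failure returns NULL */
    /* hypothetical: queue work->work on fs_info->flush_workers here */
    btrfs_wait_and_free_delalloc_work(work);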
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
3370 struct btrfs_ioctl_space_info *space); 3512 struct btrfs_ioctl_space_info *space);
3371 3513
3372/* file.c */ 3514/* file.c */
3515int btrfs_auto_defrag_init(void);
3516void btrfs_auto_defrag_exit(void);
3373int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3517int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3374 struct inode *inode); 3518 struct inode *inode);
3375int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3519int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3520void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
3376int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3521int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3377void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3522void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3378 int skip_pinned); 3523 int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3519 struct btrfs_pending_snapshot *pending); 3664 struct btrfs_pending_snapshot *pending);
3520 3665
3521/* scrub.c */ 3666/* scrub.c */
3522int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3667int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3523 struct btrfs_scrub_progress *progress, int readonly); 3668 u64 end, struct btrfs_scrub_progress *progress,
3669 int readonly, int is_dev_replace);
3524void btrfs_scrub_pause(struct btrfs_root *root); 3670void btrfs_scrub_pause(struct btrfs_root *root);
3525void btrfs_scrub_pause_super(struct btrfs_root *root); 3671void btrfs_scrub_pause_super(struct btrfs_root *root);
3526void btrfs_scrub_continue(struct btrfs_root *root); 3672void btrfs_scrub_continue(struct btrfs_root *root);
3527void btrfs_scrub_continue_super(struct btrfs_root *root); 3673void btrfs_scrub_continue_super(struct btrfs_root *root);
3528int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674int btrfs_scrub_cancel(struct btrfs_fs_info *info);
3529int btrfs_scrub_cancel(struct btrfs_root *root); 3675int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
3530int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3676 struct btrfs_device *dev);
3531int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
3532int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3533 struct btrfs_scrub_progress *progress); 3679 struct btrfs_scrub_progress *progress);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
655 BTRFS_RESERVE_NO_FLUSH);
655 /* 656 /*
656 * Since we're under a transaction reserve_metadata_bytes could 657 * Since we're under a transaction reserve_metadata_bytes could
657 * try to commit the transaction which will make it return 658 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
686 * reserve something strictly for us. If not be a pain and try 687 * reserve something strictly for us. If not be a pain and try
687 * to steal from the delalloc block rsv. 688 * to steal from the delalloc block rsv.
688 */ 689 */
689 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 690 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
691 BTRFS_RESERVE_NO_FLUSH);
690 if (!ret) 692 if (!ret)
691 goto out; 693 goto out;
692 694
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1255 struct btrfs_delayed_node *delayed_node = NULL; 1257 struct btrfs_delayed_node *delayed_node = NULL;
1256 struct btrfs_root *root; 1258 struct btrfs_root *root;
1257 struct btrfs_block_rsv *block_rsv; 1259 struct btrfs_block_rsv *block_rsv;
1258 unsigned long nr = 0;
1259 int need_requeue = 0; 1260 int need_requeue = 0;
1260 int ret; 1261 int ret;
1261 1262
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1316 delayed_node); 1317 delayed_node);
1317 mutex_unlock(&delayed_node->mutex); 1318 mutex_unlock(&delayed_node->mutex);
1318 1319
1319 nr = trans->blocks_used;
1320
1321 trans->block_rsv = block_rsv; 1320 trans->block_rsv = block_rsv;
1322 btrfs_end_transaction_dmeta(trans, root); 1321 btrfs_end_transaction_dmeta(trans, root);
1323 __btrfs_btree_balance_dirty(root, nr); 1322 btrfs_btree_balance_dirty_nodelay(root);
1324free_path: 1323free_path:
1325 btrfs_free_path(path); 1324 btrfs_free_path(path);
1326out: 1325out:
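[editor's note] Everything from here on is the new dev-replace machinery: an item in the device tree records the replace state (the BTRFS_DEV_REPLACE_ITEM_STATE_* values defined in ctree.h above) so an interrupted replace can be resumed after a remount, and btrfs_init_dev_replace() below rebuilds the in-memory struct btrfs_dev_replace from that item at mount time. A condensed sketch of the state handling it performs:

    switch (dev_replace->replace_state) {
    case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
            /* nothing in flight: no source or target device to look up */
            break;
    case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
            /* a replace was interrupted: re-resolve srcdev and tgtdev */
            break;
    }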
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
107 pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
156 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
 223 * need to delete the old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
235 pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
373 "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
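/*
 * Illustration (not part of the original file): userspace reaches
 * btrfs_dev_replace_start() via the dev_replace ioctl, typically through
 * btrfs-progs, e.g.:
 *
 *     btrfs replace start /dev/sdb /dev/sdc /mnt
 *     btrfs replace status /mnt
 *
 * The source device may be named by path or by devid, the target only
 * by path, matching the argument checks at the top of this function.
 */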
430
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is no longer part of the filesystem and its first
537 * superblock is scratched out so that it is no longer marked as
538 * belonging to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
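/*
 * Worked example of the swap above (illustrative, assuming
 * BTRFS_DEV_REPLACE_DEVID is the reserved devid 0): if the source was
 * devid 1, the target enters this function as devid 0 and leaves it as
 * devid 1 carrying the source's uuid, while the source ends up holding
 * devid 0 and the target's old uuid until it is removed. Seen from the
 * chunk tree and the superblocks, the replacement is thus invisible.
 */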
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
604 /* even if !dev_replace->is_valid, the values are good enough for
605 * the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
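/*
 * Illustrative arithmetic for the progress calculation above (not part
 * of the original file): with srcdev->total_bytes = 1099511627776 (1 TiB)
 * and cursor_left = 274877906944 (256 GiB),
 *
 *     progress_1000 = 274877906944 / (1099511627776 / 1000) = 250
 *
 * i.e. 25.0%; the resume kthread later does do_div(progress, 10) and
 * prints "@25%".
 */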
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
800 * return true even if tgtdev is missing (this is
801 * something that can happen if the dev_replace
802 * procedure is suspended by an umount and then
803 * the tgtdev went missing, or "btrfs dev scan" was
804 * not called and the filesystem is remounted
805 * in degraded state). This does not stop the
806 * dev_replace procedure. It needs to be canceled
807 * manually if cancellation is wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
820 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
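
The lock/unlock pair above is a hand-rolled recursive mutex keyed on the owner's pid: the management_lock only guards the owner/nesting bookkeeping, while contention happens on the outer lock. A minimal userspace sketch of the same pattern, using pthreads in place of kernel mutexes (the rlock names and the demo in main() are illustrative, not btrfs API):

#include <pthread.h>
#include <stdio.h>

struct rlock {
	pthread_mutex_t lock;            /* the lock callers contend on */
	pthread_mutex_t management_lock; /* guards owner + nesting_level */
	pthread_t owner;
	int nesting_level;
};

static void rlock_acquire(struct rlock *r)
{
	pthread_mutex_lock(&r->management_lock);
	if (r->nesting_level > 0 && pthread_equal(r->owner, pthread_self())) {
		/* nested acquisition by the current owner: bump the count */
		r->nesting_level++;
		pthread_mutex_unlock(&r->management_lock);
		return;
	}
	pthread_mutex_unlock(&r->management_lock);

	/* not a nested case: take the real lock, then record ownership */
	pthread_mutex_lock(&r->lock);
	pthread_mutex_lock(&r->management_lock);
	r->owner = pthread_self();
	r->nesting_level = 1;
	pthread_mutex_unlock(&r->management_lock);
}

static void rlock_release(struct rlock *r)
{
	pthread_mutex_lock(&r->management_lock);
	if (--r->nesting_level == 0) {
		/* last unlock by the owner releases the real lock */
		pthread_mutex_unlock(&r->management_lock);
		pthread_mutex_unlock(&r->lock);
		return;
	}
	pthread_mutex_unlock(&r->management_lock);
}

int main(void)
{
	struct rlock r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.management_lock = PTHREAD_MUTEX_INITIALIZER,
		.nesting_level = 0,
	};

	rlock_acquire(&r);
	rlock_acquire(&r);	/* nested: must not deadlock */
	rlock_release(&r);
	rlock_release(&r);
	printf("nested lock/unlock completed\n");
	return 0;
}

Unlike the kernel version there is no fast path for the uncontended case, but the ownership bookkeeping is the same.
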
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
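
For orientation, a condensed sketch of how the ioctl dispatcher added to fs/btrfs/ioctl.c in this same series (see the diffstat) consumes the API declared above. Only the btrfs_dev_replace_* calls and the BTRFS_IOCTL_DEV_REPLACE_CMD_* constants come from the patch set; the function name and the omitted permission/copy_to_user handling are illustrative:

static long demo_dev_replace_ioctl(struct btrfs_root *root,
				   struct btrfs_ioctl_dev_replace_args *args)
{
	int ret;

	switch (args->cmd) {
	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
		ret = btrfs_dev_replace_start(root, args);
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
		btrfs_dev_replace_status(root->fs_info, args);
		ret = 0;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
		ret = btrfs_dev_replace_cancel(root->fs_info, args);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}
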
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238 /* return back any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
257 * see if there is room in the item to insert this
258 * name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
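
The -EOVERFLOW branch of the new btrfs_check_dir_item_collision() boils down to leaf-space arithmetic. A standalone sketch of just that check; the numeric sizes are assumptions for a 4 KiB leaf (30-byte btrfs_dir_item, 25-byte btrfs_item, 101-byte leaf header), not values taken from this patch:

#include <stdio.h>

/* assumed on-disk sizes, see above */
#define DIR_ITEM_SIZE	30		/* sizeof(struct btrfs_dir_item) */
#define ITEM_SIZE	25		/* sizeof(struct btrfs_item) */
#define LEAF_DATA_SIZE	(4096 - 101)	/* BTRFS_LEAF_DATA_SIZE, 4K leaf */

/* mirrors the check at the end of btrfs_check_dir_item_collision() */
static int name_fits(int existing_item_size, int name_len)
{
	int data_size = DIR_ITEM_SIZE + name_len + ITEM_SIZE;

	return data_size + existing_item_size + ITEM_SIZE <= LEAF_DATA_SIZE;
}

int main(void)
{
	/* hypothetical item already holding names with the same hash */
	printf("fits: %d\n", name_fits(2000, 255));	/* 1: room left  */
	printf("fits: %d\n", name_fits(3700, 255));	/* 0: -EOVERFLOW */
	return 0;
}
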
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
2495 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3324
3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3274 3326
3275 btrfs_scrub_cancel(root); 3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
3418 if (num_dirty > thresh) { 3471 if (num_dirty > thresh) {
3419 balance_dirty_pages_ratelimited_nr( 3472 balance_dirty_pages_ratelimited(
3420 root->fs_info->btree_inode->i_mapping, 1); 3473 root->fs_info->btree_inode->i_mapping);
3421 } 3474 }
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited_nr( 3484{
3441 root->fs_info->btree_inode->i_mapping, 1); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..521e9d4424f6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
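
The two helpers deleted in the hunk above move into the new fs/btrfs/math.h (44 lines in the diffstat). Reconstructed from the deleted lines, they presumably land there as static inlines along these lines:

#include <asm/div64.h>

static inline u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}
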
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
3689 * 1/8th of the space. If we can flush, let it overcommit up to 3679 * 1/2th of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
3888 * @root - the root we're allocating for 3898 * @root - the root we're allocating for
3889 * @block_rsv - the block_rsv we're allocating for 3899 * @block_rsv - the block_rsv we're allocating for
3890 * @orig_bytes - the number of bytes we want 3900 * @orig_bytes - the number of bytes we want
3891 * @flush - wether or not we can flush to make our reservation 3901 * @flush - whether or not we can flush to make our reservation
3892 * 3902 *
3893 * This will reserve orgi_bytes number of bytes from the space info associated 3903 * This will reserve orgi_bytes number of bytes from the space info associated
3894 * with the block_rsv. If there is not enough space it will make an attempt to 3904 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4016 * would happen. So skip delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
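
To make the retry ladder above easier to follow: BTRFS_RESERVE_FLUSH_LIMIT callers hop over the two delalloc stages (flushing delalloc they may themselves be involved in could deadlock, as the new comment says) and stop short of COMMIT_TRANS. A compact model of just that progression; the enum values mirror the kernel's flush_state and btrfs_reserve_flush_enum, the helper names are illustrative:

enum flush_state {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

enum reserve_flush { NO_FLUSH, FLUSH_LIMIT, FLUSH_ALL };

/* advance the ladder one step, skipping delalloc for FLUSH_LIMIT */
static int next_flush_state(enum reserve_flush flush, int state)
{
	state++;
	if (flush == FLUSH_LIMIT &&
	    (state == FLUSH_DELALLOC || state == FLUSH_DELALLOC_WAIT))
		state = ALLOC_CHUNK;
	return state;
}

/* may we loop back to "again:"? FLUSH_LIMIT stops before COMMIT_TRANS,
 * and NO_FLUSH callers bail out before ever reaching this check */
static int may_retry(enum reserve_flush flush, int state)
{
	if (flush == FLUSH_LIMIT)
		return state < COMMIT_TRANS;
	return flush == FLUSH_ALL && state <= COMMIT_TRANS;
}
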
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we need to not flush since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) { 4585 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4586 spin_lock(&BTRFS_I(inode)->lock);
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret; 4591 return ret;
4577 } 4592 }
4578 } 4593 }
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4622 btrfs_ino(inode),
4608 to_free, 0); 4623 to_free, 0);
4609 } 4624 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize);
4628 }
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4631 return ret;
4612 } 4632 }
4613 4633
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4639 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4640 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4641 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4642
4643 if (delalloc_lock)
4644 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4645
4624 if (to_reserve) 4646 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4647 trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{ 4991{
4970 struct btrfs_fs_info *fs_info = root->fs_info; 4992 struct btrfs_fs_info *fs_info = root->fs_info;
4971 struct btrfs_block_group_cache *cache = NULL; 4993 struct btrfs_block_group_cache *cache = NULL;
4994 struct btrfs_space_info *space_info;
4995 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4972 u64 len; 4996 u64 len;
4997 bool readonly;
4973 4998
4974 while (start <= end) { 4999 while (start <= end) {
5000 readonly = false;
4975 if (!cache || 5001 if (!cache ||
4976 start >= cache->key.objectid + cache->key.offset) { 5002 start >= cache->key.objectid + cache->key.offset) {
4977 if (cache) 5003 if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4989 } 5015 }
4990 5016
4991 start += len; 5017 start += len;
5018 space_info = cache->space_info;
4992 5019
4993 spin_lock(&cache->space_info->lock); 5020 spin_lock(&space_info->lock);
4994 spin_lock(&cache->lock); 5021 spin_lock(&cache->lock);
4995 cache->pinned -= len; 5022 cache->pinned -= len;
4996 cache->space_info->bytes_pinned -= len; 5023 space_info->bytes_pinned -= len;
4997 if (cache->ro) 5024 if (cache->ro) {
4998 cache->space_info->bytes_readonly += len; 5025 space_info->bytes_readonly += len;
5026 readonly = true;
5027 }
4999 spin_unlock(&cache->lock); 5028 spin_unlock(&cache->lock);
5000 spin_unlock(&cache->space_info->lock); 5029 if (!readonly && global_rsv->space_info == space_info) {
5030 spin_lock(&global_rsv->lock);
5031 if (!global_rsv->full) {
5032 len = min(len, global_rsv->size -
5033 global_rsv->reserved);
5034 global_rsv->reserved += len;
5035 space_info->bytes_may_use += len;
5036 if (global_rsv->reserved >= global_rsv->size)
5037 global_rsv->full = 1;
5038 }
5039 spin_unlock(&global_rsv->lock);
5040 }
5041 spin_unlock(&space_info->lock);
5001 } 5042 }
5002 5043
5003 if (cache) 5044 if (cache)
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5466 return 0; 5507 return 0;
5467} 5508}
5468 5509
5469static int __get_block_group_index(u64 flags) 5510int __get_raid_index(u64 flags)
5470{ 5511{
5471 int index; 5512 int index;
5472 5513
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
5486 5527
5487static int get_block_group_index(struct btrfs_block_group_cache *cache) 5528static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488{ 5529{
5489 return __get_block_group_index(cache->flags); 5530 return __get_raid_index(cache->flags);
5490} 5531}
5491 5532
5492enum btrfs_loop_type { 5533enum btrfs_loop_type {
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6269 block_rsv = get_block_rsv(trans, root); 6310 block_rsv = get_block_rsv(trans, root);
6270 6311
6271 if (block_rsv->size == 0) { 6312 if (block_rsv->size == 0) {
6272 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6313 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6314 BTRFS_RESERVE_NO_FLUSH);
6273 /* 6315 /*
6274 * If we couldn't reserve metadata bytes try and use some from 6316 * If we couldn't reserve metadata bytes try and use some from
6275 * the global reserve. 6317 * the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6292 static DEFINE_RATELIMIT_STATE(_rs, 6334 static DEFINE_RATELIMIT_STATE(_rs,
6293 DEFAULT_RATELIMIT_INTERVAL, 6335 DEFAULT_RATELIMIT_INTERVAL,
6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6336 /*DEFAULT_RATELIMIT_BURST*/ 2);
6295 if (__ratelimit(&_rs)) { 6337 if (__ratelimit(&_rs))
6296 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6338 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6297 WARN_ON(1); 6339 ret);
6298 } 6340 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6299 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6341 BTRFS_RESERVE_NO_FLUSH);
6300 if (!ret) { 6342 if (!ret) {
6301 return block_rsv; 6343 return block_rsv;
6302 } else if (ret && block_rsv != global_rsv) { 6344 } else if (ret && block_rsv != global_rsv) {
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7427 */ 7469 */
7428 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7429 if (target) { 7471 if (target) {
7430 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7431 } else { 7473 } else {
7432 /* 7474 /*
7433 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7461 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7462 * space to fit our block group in. 7504 * space to fit our block group in.
7463 */ 7505 */
7464 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7465 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7466 &dev_offset, NULL); 7509 &dev_offset, NULL);
7467 if (!ret) 7510 if (!ret)
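
The extent-tree.c hunks above share one theme: the old int `flush` arguments and the `_noflush` wrapper functions are folded into a single `enum btrfs_reserve_flush_enum`, so every reservation call site states its flushing policy explicitly. A minimal userspace sketch of the pattern; the enum values and refill logic below are illustrative, not the kernel's definitions:

    #include <stdio.h>

    enum reserve_flush {
        RESERVE_NO_FLUSH,     /* caller may hold locks: fail fast */
        RESERVE_FLUSH_LIMIT,  /* flush a bounded amount of dirty state */
        RESERVE_FLUSH_ALL,    /* flush as much as it takes */
    };

    struct block_rsv {
        unsigned long long size;
        unsigned long long reserved;
    };

    static int rsv_refill(struct block_rsv *rsv, unsigned long long bytes,
                          enum reserve_flush flush)
    {
        if (rsv->reserved + bytes <= rsv->size) {
            rsv->reserved += bytes;
            return 0;
        }
        if (flush == RESERVE_NO_FLUSH)
            return -1;  /* deadlock-safe path: never block here */
        /* a real implementation would write back dirty metadata here,
         * doing less work for FLUSH_LIMIT than for FLUSH_ALL */
        return -1;
    }

    int main(void)
    {
        struct block_rsv rsv = { .size = 4096, .reserved = 4096 };

        /* a full reserve plus NO_FLUSH fails fast instead of blocking */
        printf("%d\n", rsv_refill(&rsv, 512, RESERVE_NO_FLUSH));
        return 0;
    }

One enum parameter also removes the need for per-policy wrapper names like the deleted btrfs_block_rsv_refill_noflush().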
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
341{ 341{
342 struct rb_node *node; 342 struct rb_node *node;
343 343
344 if (end < start) { 344 if (end < start)
345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 (unsigned long long)end, 346 (unsigned long long)end,
347 (unsigned long long)start); 347 (unsigned long long)start);
348 WARN_ON(1);
349 }
350 state->start = start; 348 state->start = start;
351 state->end = end; 349 state->end = end;
352 350
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1919 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1921 * submit_bio directly. 1919 * submit_bio directly.
1922 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1923 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1924 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1925 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1926 */ 1924 */
1927int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1928 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1929 int mirror_num) 1927 int mirror_num)
1930{ 1928{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1946 bio->bi_size = 0; 1944 bio->bi_size = 0;
1947 map_length = length; 1945 map_length = length;
1948 1946
1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1950 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1951 if (ret) { 1949 if (ret) {
1952 bio_put(bio); 1950 bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1984int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1985 int mirror_num) 1983 int mirror_num)
1986{ 1984{
1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1988 u64 start = eb->start; 1985 u64 start = eb->start;
1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1990 int ret = 0; 1987 int ret = 0;
1991 1988
1992 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1993 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1995 start, p, mirror_num); 1992 start, p, mirror_num);
1996 if (ret) 1993 if (ret)
1997 break; 1994 break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
2010 u64 private; 2007 u64 private;
2011 u64 private_failure; 2008 u64 private_failure;
2012 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
2013 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
2014 struct extent_state *state; 2011 struct extent_state *state;
2015 int num_copies; 2012 int num_copies;
2016 int did_repair = 0; 2013 int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2047 2044
2048 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2051 failrec->len); 2048 failrec->len);
2052 if (num_copies > 1) { 2049 if (num_copies > 1) {
2053 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2054 failrec->logical, page, 2051 failrec->logical, page,
2055 failrec->failed_mirror); 2052 failrec->failed_mirror);
2056 did_repair = !ret; 2053 did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2159 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2160 */ 2157 */
2161 } 2158 }
2162 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2164 failrec->logical, failrec->len);
2165 if (num_copies == 1) { 2161 if (num_copies == 1) {
2166 /* 2162 /*
2167 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2466 return bio; 2462 return bio;
2467} 2463}
2468 2464
2469/*
2470 * Since writes are async, they will only return -ENOMEM.
2471 * Reads can return the full range of I/O error conditions.
2472 */
2473static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2474 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2475{ 2467{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4721 } 4713 }
4722 4714
4723 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4724 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4726 eb->len, start, min_len); 4718 eb->len, start, min_len);
4727 WARN_ON(1);
4728 return -EINVAL; 4719 return -EINVAL;
4729 } 4720 }
4730 4721
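
A second pattern in the extent_io.c hunks: a printk() immediately followed by WARN_ON(1) becomes a single WARN(1, ...), which emits the message and the stack trace together. Roughly, the macro shape being relied on looks like the toy below; the kernel's real WARN() lives in include/asm-generic/bug.h and does considerably more:

    #include <stdio.h>

    /* toy stand-in: evaluate the condition, report if true, and hand the
     * condition back so the macro can sit inside an if () */
    #define WARN(cond, ...) ({                    \
        int __w = !!(cond);                       \
        if (__w)                                  \
            fprintf(stderr, __VA_ARGS__);         \
        __w;                                      \
    })

    int main(void)
    {
        unsigned long long start = 100, end = 50;

        if (WARN(end < start, "end < start %llu %llu\n", end, start))
            return 1;
        return 0;
    }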
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
338 gfp_t gfp_flags); 338 gfp_t gfp_flags);
339 339
340struct btrfs_mapping_tree; 340struct btrfs_fs_info;
341 341
342int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
343 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
344 int mirror_num); 344 int mirror_num);
345int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..f169d6b11d7f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
49struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
50{ 50{
51 struct extent_map *em; 51 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
53 if (!em) 53 if (!em)
54 return NULL; 54 return NULL;
55 em->in_tree = 0; 55 em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 merge = rb_entry(rb, struct extent_map, rb_node); 198 merge = rb_entry(rb, struct extent_map, rb_node);
199 if (rb && mergable_maps(merge, em)) { 199 if (rb && mergable_maps(merge, em)) {
200 em->start = merge->start; 200 em->start = merge->start;
201 em->orig_start = merge->orig_start;
201 em->len += merge->len; 202 em->len += merge->len;
202 em->block_len += merge->block_len; 203 em->block_len += merge->block_len;
203 em->block_start = merge->block_start; 204 em->block_start = merge->block_start;
204 merge->in_tree = 0; 205 merge->in_tree = 0;
205 if (merge->generation > em->generation) { 206 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
206 em->mod_start = em->start; 207 em->mod_start = merge->mod_start;
207 em->mod_len = em->len; 208 em->generation = max(em->generation, merge->generation);
208 em->generation = merge->generation; 209 list_move(&em->list, &tree->modified_extents);
209 list_move(&em->list, &tree->modified_extents);
210 }
211 210
212 list_del_init(&merge->list); 211 list_del_init(&merge->list);
213 rb_erase(&merge->rb_node, &tree->map); 212 rb_erase(&merge->rb_node, &tree->map);
@@ -223,23 +222,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
223 em->block_len += merge->len; 222 em->block_len += merge->len;
224 rb_erase(&merge->rb_node, &tree->map); 223 rb_erase(&merge->rb_node, &tree->map);
225 merge->in_tree = 0; 224 merge->in_tree = 0;
226 if (merge->generation > em->generation) { 225 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
227 em->mod_len = em->len; 226 em->generation = max(em->generation, merge->generation);
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list); 227 list_del_init(&merge->list);
232 free_extent_map(merge); 228 free_extent_map(merge);
233 } 229 }
234} 230}
235 231
236/** 232/**
237 * unpint_extent_cache - unpin an extent from the cache 233 * unpin_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in 234 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file 235 * @start: logical offset in the file
240 * @len: length of the extent 236 * @len: length of the extent
241 * @gen: generation that this extent has been modified in 237 * @gen: generation that this extent has been modified in
242 * @prealloc: if this is set we need to clear the prealloc flag
243 * 238 *
244 * Called after an extent has been written to disk properly. Set the generation 239 * Called after an extent has been written to disk properly. Set the generation
245 * to the generation that actually added the file item to the inode so we know 240 * to the generation that actually added the file item to the inode so we know
@@ -266,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
266 em->mod_start = em->start; 261 em->mod_start = em->start;
267 em->mod_len = em->len; 262 em->mod_len = em->len;
268 263
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 264 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
270 prealloc = true; 265 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 266 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
272 } 267 }
273 268
274 try_merge_map(tree, em); 269 try_merge_map(tree, em);
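
The try_merge_map() hunks drop the conditional generation copy and always recompute the modified range: when `em` absorbs a neighbour, mod_start/mod_len must be stretched to cover both maps and the newer generation wins. The arithmetic in isolation, as a userspace sketch (field names mirror the kernel struct; the rbtree plumbing is omitted):

    struct em {
        unsigned long long start, len;
        unsigned long long mod_start, mod_len;
        unsigned long long generation;
    };

    /* em absorbs the map immediately before it */
    static void merge_with_prev(struct em *em, const struct em *prev)
    {
        em->start = prev->start;
        em->len += prev->len;
        /* old mod end stays put; mod_start drops to the neighbour's */
        em->mod_len = (em->mod_len + em->mod_start) - prev->mod_start;
        em->mod_start = prev->mod_start;
        em->generation = em->generation > prev->generation ?
                         em->generation : prev->generation;
    }

    /* em absorbs the map immediately after it */
    static void merge_with_next(struct em *em, const struct em *next)
    {
        em->len += next->len;
        /* mod_start stays put; mod end rises to the neighbour's */
        em->mod_len = (next->mod_start + next->mod_len) - em->mod_start;
        em->generation = em->generation > next->generation ?
                         em->generation : next->generation;
    }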
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..922943ce29e8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
17 18
18struct extent_map { 19struct extent_map {
19 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
24 u64 mod_start; 25 u64 mod_start;
25 u64 mod_len; 26 u64 mod_len;
26 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
27 u64 block_start; 29 u64 block_start;
28 u64 block_len; 30 u64 block_len;
29 u64 generation; 31 u64 generation;
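
EXTENT_FLAG_FILLING and the other EXTENT_FLAG_* values are bit positions in em->flags, driven through the kernel's set_bit()/test_bit()/clear_bit(). The equivalent plain-C bit operations, for reference (illustrative helpers, not kernel code):

    #define EXTENT_FLAG_FILLING 5  /* filling in a preallocated extent */

    static void set_flag(unsigned long *flags, int bit)
    {
        *flags |= 1UL << bit;
    }

    static int test_flag(const unsigned long *flags, int bit)
    {
        return !!(*flags & (1UL << bit));
    }

    static void clear_flag(unsigned long *flags, int bit)
    {
        *flags &= ~(1UL << bit);
    }

Judging by the unpin_extent_cache() hunk above, splitting FILLING out of PREALLOC means unpinning clears only the transient filling state while the map keeps its preallocated marking.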
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..bd38cef42358 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
133 return ERR_PTR(ret); 133 return ERR_PTR(ret);
134} 134}
135 135
136
137int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
138 struct btrfs_root *root, 137 struct btrfs_root *root,
139 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
151 return ret; 150 return ret;
152} 151}
153 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
154 173
155static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
156 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
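
btrfs_file_extent_length() dispatches on the extent type: regular and preallocated extents report num_bytes, inline extents report their inline length, and anything else is treated as corruption via BUG(). The same dispatch as a self-contained userspace model (type and field names here are illustrative):

    #include <stdlib.h>

    enum fext_type { FEXT_REG, FEXT_PREALLOC, FEXT_INLINE };

    struct fext_item {
        enum fext_type type;
        unsigned long long num_bytes;   /* on-disk extents */
        unsigned long long inline_len;  /* inline extents */
    };

    static unsigned long long fext_length(const struct fext_item *fi)
    {
        switch (fi->type) {
        case FEXT_REG:
        case FEXT_PREALLOC:
            return fi->num_bytes;
        case FEXT_INLINE:
            return fi->inline_len;
        default:
            abort();  /* mirrors the BUG(): unknown item type */
        }
    }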
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..77061bf43edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
90 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
91 * pass in is freed 92 * pass in is freed
92 */ 93 */
93static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
95{ 96{
96 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
118 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
121 goto exists; 122 return -EEXIST;
122 } 123 }
123 } 124 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 128 return 0;
129}
128 130
129exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
130 kfree(defrag); 132{
131 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
132 138
139 return 1;
133} 140}
134 141
135/* 142/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
142 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
144 u64 transid; 151 u64 transid;
152 int ret;
145 153
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
147 return 0;
148
149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 155 return 0;
151 156
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
157 else 162 else
158 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
159 164
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
161 if (!defrag) 166 if (!defrag)
162 return -ENOMEM; 167 return -ENOMEM;
163 168
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
166 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
167 172
168 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
170 __btrfs_add_inode_defrag(inode, defrag); 175 /*
171 else 176 * If we set IN_DEFRAG flag and evict the inode from memory,
172 kfree(defrag); 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. In that case, we may find the existing defrag.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 187 return 0;
175} 188}
176 189
177/* 190/*
178 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
179 */ 194 */
180struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 195void btrfs_requeue_inode_defrag(struct inode *inode,
181 u64 root, u64 ino, 196 struct inode_defrag *defrag)
182 struct rb_node **next) 197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need to merge
206 * them together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
217
218/*
219 * pick the defragable inode that we want, if it doesn't exist, we will get
220 * the next one.
221 */
222static struct inode_defrag *
223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
183{ 224{
184 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
190 tmp.ino = ino; 231 tmp.ino = ino;
191 tmp.root = root; 232 tmp.root = root;
192 233
193 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
194 while (p) { 236 while (p) {
195 parent = p; 237 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
201 else if (ret > 0) 243 else if (ret > 0)
202 p = parent->rb_right; 244 p = parent->rb_right;
203 else 245 else
204 return entry; 246 goto out;
205 } 247 }
206 248
207 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
209 parent = rb_next(parent); 251 if (parent)
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 253 else
212 *next = parent; 254 entry = NULL;
213 } 255 }
214 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
215} 261}
216 262
217/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
218 * run through the list of inodes in the FS that need
219 * defragging
220 */
221int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222{ 264{
223 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
224 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
225 struct inode *inode; 292 struct inode *inode;
226 struct rb_node *n;
227 struct btrfs_key key; 293 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0;
230 u64 root_objectid = 0;
231 int num_defrag; 295 int num_defrag;
232 int defrag_batch = 1024;
233 296
297 /* get the inode */
298 key.objectid = defrag->root;
299 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
300 key.offset = (u64)-1;
301 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
302 if (IS_ERR(inode_root)) {
303 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
304 return PTR_ERR(inode_root);
305 }
306
307 key.objectid = defrag->ino;
308 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
309 key.offset = 0;
310 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
311 if (IS_ERR(inode)) {
312 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
313 return PTR_ERR(inode);
314 }
315
316 /* do a chunk of defrag */
317 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
234 memset(&range, 0, sizeof(range)); 318 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 319 range.len = (u64)-1;
320 range.start = defrag->last_offset;
321
322 sb_start_write(fs_info->sb);
323 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
324 BTRFS_DEFRAG_BATCH);
325 sb_end_write(fs_info->sb);
326 /*
327 * if we filled the whole defrag batch, there
328 * must be more work to do. Queue this defrag
329 * again
330 */
331 if (num_defrag == BTRFS_DEFRAG_BATCH) {
332 defrag->last_offset = range.start;
333 btrfs_requeue_inode_defrag(inode, defrag);
334 } else if (defrag->last_offset && !defrag->cycled) {
335 /*
336 * we didn't fill our defrag batch, but
337 * we didn't start at zero. Make sure we loop
338 * around to the start of the file.
339 */
340 defrag->last_offset = 0;
341 defrag->cycled = 1;
342 btrfs_requeue_inode_defrag(inode, defrag);
343 } else {
344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
345 }
346
347 iput(inode);
348 return 0;
349}
350
351/*
352 * run through the list of inodes in the FS that need
353 * defragging
354 */
355int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
356{
357 struct inode_defrag *defrag;
358 u64 first_ino = 0;
359 u64 root_objectid = 0;
236 360
237 atomic_inc(&fs_info->defrag_running); 361 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock);
239 while(1) { 362 while(1) {
240 n = NULL; 363 if (!__need_auto_defrag(fs_info->tree_root))
364 break;
241 365
242 /* find an inode to defrag */ 366 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 367 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 368 first_ino);
245 if (!defrag) { 369 if (!defrag) {
246 if (n) { 370 if (root_objectid || first_ino) {
247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node);
249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 371 root_objectid = 0;
251 first_ino = 0; 372 first_ino = 0;
252 continue; 373 continue;
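
The rework above converts the defrag queue to a pick-then-process scheme: btrfs_pick_defrag_inode() erases the chosen record from the rbtree while defrag_inodes_lock is held, and the caller runs the actual defrag with no spinlock held, requeueing only if work remains. The locking shape, sketched with a pthread mutex and a singly linked list standing in for the spinlock and rbtree:

    #include <pthread.h>
    #include <stddef.h>

    struct defrag {
        struct defrag *next;
        unsigned long long ino;
    };

    static pthread_mutex_t defrag_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct defrag *defrag_queue;

    static struct defrag *pick_defrag(void)
    {
        struct defrag *d;

        pthread_mutex_lock(&defrag_lock);
        d = defrag_queue;          /* erase under the lock ... */
        if (d)
            defrag_queue = d->next;
        pthread_mutex_unlock(&defrag_lock);
        return d;  /* ... caller processes it with no lock held */
    }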
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
255 } 376 }
256 } 377 }
257 378
258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 379 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 380 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262
263 if (btrfs_fs_closing(fs_info))
264 goto next_free;
265
266 spin_unlock(&fs_info->defrag_inodes_lock);
267
268 /* get the inode */
269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root))
274 goto next;
275
276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0;
279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode))
282 goto next;
283 381
284 /* do a chunk of defrag */ 382 __btrfs_run_defrag_inode(fs_info, defrag);
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch);
289 /*
290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag
292 * again
293 */
294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag);
297 /*
298 * we don't want to kfree defrag, we added it back to
299 * the rbtree
300 */
301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) {
303 /*
304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file.
307 */
308 defrag->last_offset = 0;
309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL;
312 }
313
314 iput(inode);
315next:
316 spin_lock(&fs_info->defrag_inodes_lock);
317next_free:
318 kfree(defrag);
319 } 383 }
320 spin_unlock(&fs_info->defrag_inodes_lock);
321
322 atomic_dec(&fs_info->defrag_running); 384 atomic_dec(&fs_info->defrag_running);
323 385
324 /* 386 /*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
526 split->block_len = em->block_len; 588 split->block_len = em->block_len;
527 else 589 else
528 split->block_len = split->len; 590 split->block_len = split->len;
591 split->orig_block_len = max(split->block_len,
592 em->orig_block_len);
529 split->generation = gen; 593 split->generation = gen;
530 split->bdev = em->bdev; 594 split->bdev = em->bdev;
531 split->flags = flags; 595 split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
547 split->flags = flags; 611 split->flags = flags;
548 split->compress_type = em->compress_type; 612 split->compress_type = em->compress_type;
549 split->generation = gen; 613 split->generation = gen;
614 split->orig_block_len = max(em->block_len,
615 em->orig_block_len);
550 616
551 if (compressed) { 617 if (compressed) {
552 split->block_len = em->block_len; 618 split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
555 } else { 621 } else {
556 split->block_len = split->len; 622 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 623 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 624 split->orig_start = em->orig_start;
559 } 625 }
560 626
561 ret = add_extent_mapping(em_tree, split); 627 ret = add_extent_mapping(em_tree, split);
@@ -1346,10 +1412,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1346 1412
1347 cond_resched(); 1413 cond_resched();
1348 1414
1349 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1415 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 dirty_pages);
1351 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1416 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1352 btrfs_btree_balance_dirty(root, 1); 1417 btrfs_btree_balance_dirty(root);
1353 1418
1354 pos += copied; 1419 pos += copied;
1355 num_written += copied; 1420 num_written += copied;
@@ -1398,6 +1463,24 @@ out:
1398 return written ? written : err; 1463 return written ? written : err;
1399} 1464}
1400 1465
1466static void update_time_for_write(struct inode *inode)
1467{
1468 struct timespec now;
1469
1470 if (IS_NOCMTIME(inode))
1471 return;
1472
1473 now = current_fs_time(inode->i_sb);
1474 if (!timespec_equal(&inode->i_mtime, &now))
1475 inode->i_mtime = now;
1476
1477 if (!timespec_equal(&inode->i_ctime, &now))
1478 inode->i_ctime = now;
1479
1480 if (IS_I_VERSION(inode))
1481 inode_inc_iversion(inode);
1482}
1483
1401static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1484static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1402 const struct iovec *iov, 1485 const struct iovec *iov,
1403 unsigned long nr_segs, loff_t pos) 1486 unsigned long nr_segs, loff_t pos)
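
update_time_for_write() exists because, as the comment in the next hunk explains, the inode update is already paid for by the extent reservation, so a separate file_update_time() transaction would be wasted; timestamps are assigned only when they actually differ from the current time. The compare-before-assign shape in userspace terms (illustrative, using POSIX timespec):

    #include <stdbool.h>
    #include <time.h>

    static bool ts_equal(const struct timespec *a, const struct timespec *b)
    {
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
    }

    /* bump mtime/ctime only when they differ from "now", so an unchanged
     * timestamp never dirties the inode */
    static void update_times(struct timespec *mtime, struct timespec *ctime_,
                             const struct timespec *now)
    {
        if (!ts_equal(mtime, now))
            *mtime = *now;
        if (!ts_equal(ctime_, now))
            *ctime_ = *now;
    }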
@@ -1410,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1410 ssize_t num_written = 0; 1493 ssize_t num_written = 0;
1411 ssize_t err = 0; 1494 ssize_t err = 0;
1412 size_t count, ocount; 1495 size_t count, ocount;
1496 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1413 1497
1414 sb_start_write(inode->i_sb); 1498 sb_start_write(inode->i_sb);
1415 1499
@@ -1452,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1452 goto out; 1536 goto out;
1453 } 1537 }
1454 1538
1455 err = file_update_time(file); 1539 /*
1456 if (err) { 1540 * We reserve space for updating the inode when we reserve space for the
1457 mutex_unlock(&inode->i_mutex); 1541 * extent we are going to write, so we will enospc out there. We don't
1458 goto out; 1542 * need to start yet another transaction to update the inode as we will
1459 } 1543 * update the inode when we finish writing whatever data we write.
1544 */
1545 update_time_for_write(inode);
1460 1546
1461 start_pos = round_down(pos, root->sectorsize); 1547 start_pos = round_down(pos, root->sectorsize);
1462 if (start_pos > i_size_read(inode)) { 1548 if (start_pos > i_size_read(inode)) {
@@ -1467,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1467 } 1553 }
1468 } 1554 }
1469 1555
1556 if (sync)
1557 atomic_inc(&BTRFS_I(inode)->sync_writers);
1558
1470 if (unlikely(file->f_flags & O_DIRECT)) { 1559 if (unlikely(file->f_flags & O_DIRECT)) {
1471 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1560 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1472 pos, ppos, count, ocount); 1561 pos, ppos, count, ocount);
@@ -1493,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1493 * this will either be one more than the running transaction 1582 * this will either be one more than the running transaction
1494 * or the generation used for the next transaction if there isn't 1583 * or the generation used for the next transaction if there isn't
1495 * one running right now. 1584 * one running right now.
1585 *
1586 * We also have to set last_sub_trans to the current log transid,
1587 * otherwise subsequent syncs to a file that's been synced in this
1588 * transaction will appear to have already occurred.
1496 */ 1589 */
1497 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1590 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1591 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1498 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1592 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1499 err = generic_write_sync(file, pos, num_written); 1593 err = generic_write_sync(file, pos, num_written);
1500 if (err < 0 && num_written > 0) 1594 if (err < 0 && num_written > 0)
1501 num_written = err; 1595 num_written = err;
1502 } 1596 }
1503out: 1597out:
1598 if (sync)
1599 atomic_dec(&BTRFS_I(inode)->sync_writers);
1504 sb_end_write(inode->i_sb); 1600 sb_end_write(inode->i_sb);
1505 current->backing_dev_info = NULL; 1601 current->backing_dev_info = NULL;
1506 return num_written ? num_written : err; 1602 return num_written ? num_written : err;
@@ -1551,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1551 * out of the ->i_mutex. If so, we can flush the dirty pages by 1647 * out of the ->i_mutex. If so, we can flush the dirty pages by
1552 * multiple tasks, and improve performance. 1648 * multiple tasks, and improve performance.
1553 */ 1649 */
1650 atomic_inc(&BTRFS_I(inode)->sync_writers);
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1651 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1652 atomic_dec(&BTRFS_I(inode)->sync_writers);
1555 if (ret) 1653 if (ret)
1556 return ret; 1654 return ret;
1557 1655
@@ -1562,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1562 * range being left. 1660 * range being left.
1563 */ 1661 */
1564 atomic_inc(&root->log_batch); 1662 atomic_inc(&root->log_batch);
1565 btrfs_wait_ordered_range(inode, start, end); 1663 btrfs_wait_ordered_range(inode, start, end - start + 1);
1566 atomic_inc(&root->log_batch); 1664 atomic_inc(&root->log_batch);
1567 1665
1568 /* 1666 /*
@@ -1768,6 +1866,7 @@ out:
1768 1866
1769 hole_em->block_start = EXTENT_MAP_HOLE; 1867 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0; 1868 hole_em->block_len = 0;
1869 hole_em->orig_block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1870 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1871 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid; 1872 hole_em->generation = trans->transid;
@@ -1797,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1797 struct btrfs_path *path; 1896 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv; 1897 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans; 1898 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1899 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1801 u64 lockstart = (offset + mask) & ~mask; 1900 u64 lockend = round_down(offset + len,
1802 u64 lockend = ((offset + len) & ~mask) - 1; 1901 BTRFS_I(inode)->root->sectorsize) - 1;
1803 u64 cur_offset = lockstart; 1902 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1903 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end; 1904 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0; 1905 int ret = 0;
1808 int err = 0; 1906 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1907 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT); 1908 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1811 1909
1812 btrfs_wait_ordered_range(inode, offset, len); 1910 btrfs_wait_ordered_range(inode, offset, len);
1813 1911
1814 mutex_lock(&inode->i_mutex); 1912 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) { 1913 /*
1816 mutex_unlock(&inode->i_mutex); 1914 * We needn't truncate any page which is beyond the end of the file
1817 return 0; 1915 * because we are sure there is no data there.
1818 } 1916 */
1819
1820 /* 1917 /*
1821 * Only do this if we are in the same page and we aren't doing the 1918 * Only do this if we are in the same page and we aren't doing the
1822 * entire page. 1919 * entire page.
1823 */ 1920 */
1824 if (same_page && len < PAGE_CACHE_SIZE) { 1921 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0); 1922 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1923 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex); 1924 mutex_unlock(&inode->i_mutex);
1827 return ret; 1925 return ret;
1828 } 1926 }
1829 1927
1830 /* zero back part of the first page */ 1928 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0); 1929 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1832 if (ret) { 1930 ret = btrfs_truncate_page(inode, offset, 0, 0);
1833 mutex_unlock(&inode->i_mutex); 1931 if (ret) {
1834 return ret; 1932 mutex_unlock(&inode->i_mutex);
1933 return ret;
1934 }
1835 } 1935 }
1836 1936
1837 /* zero the front end of the last page */ 1937 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1938 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1839 if (ret) { 1939 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1840 mutex_unlock(&inode->i_mutex); 1940 if (ret) {
1841 return ret; 1941 mutex_unlock(&inode->i_mutex);
1942 return ret;
1943 }
1842 } 1944 }
1843 1945
1844 if (lockend < lockstart) { 1946 if (lockend < lockstart) {
@@ -1931,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1931 break; 2033 break;
1932 } 2034 }
1933 2035
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root); 2036 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr); 2037 btrfs_btree_balance_dirty(root);
1937 2038
1938 trans = btrfs_start_transaction(root, 3); 2039 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) { 2040 if (IS_ERR(trans)) {
@@ -1964,11 +2065,13 @@ out_trans:
1964 if (!trans) 2065 if (!trans)
1965 goto out_free; 2066 goto out_free;
1966 2067
2068 inode_inc_iversion(inode);
2069 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2070
1967 trans->block_rsv = &root->fs_info->trans_block_rsv; 2071 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode); 2072 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root); 2073 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr); 2074 btrfs_btree_balance_dirty(root);
1972out_free: 2075out_free:
1973 btrfs_free_path(path); 2076 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv); 2077 btrfs_free_block_rsv(root, rsv);
@@ -1992,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
1992 u64 alloc_end; 2095 u64 alloc_end;
1993 u64 alloc_hint = 0; 2096 u64 alloc_hint = 0;
1994 u64 locked_end; 2097 u64 locked_end;
1995 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1996 struct extent_map *em; 2098 struct extent_map *em;
2099 int blocksize = BTRFS_I(inode)->root->sectorsize;
1997 int ret; 2100 int ret;
1998 2101
1999 alloc_start = offset & ~mask; 2102 alloc_start = round_down(offset, blocksize);
2000 alloc_end = (offset + len + mask) & ~mask; 2103 alloc_end = round_up(offset + len, blocksize);
2001 2104
2002 /* Make sure we aren't being given some crap mode */ 2105 /* Make sure we aren't being given some crap mode */
2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2106 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2010,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2010 * Make sure we have enough space before we do the 2113 * Make sure we have enough space before we do the
2011 * allocation. 2114 * allocation.
2012 */ 2115 */
2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2116 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2014 if (ret) 2117 if (ret)
2015 return ret; 2118 return ret;
2016 2119
@@ -2078,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2078 } 2181 }
2079 last_byte = min(extent_map_end(em), alloc_end); 2182 last_byte = min(extent_map_end(em), alloc_end);
2080 actual_end = min_t(u64, extent_map_end(em), offset + len); 2183 actual_end = min_t(u64, extent_map_end(em), offset + len);
2081 last_byte = (last_byte + mask) & ~mask; 2184 last_byte = ALIGN(last_byte, blocksize);
2082 2185
2083 if (em->block_start == EXTENT_MAP_HOLE || 2186 if (em->block_start == EXTENT_MAP_HOLE ||
2084 (cur_offset >= inode->i_size && 2187 (cur_offset >= inode->i_size &&
@@ -2117,11 +2220,11 @@ static long btrfs_fallocate(struct file *file, int mode,
2117out: 2220out:
2118 mutex_unlock(&inode->i_mutex); 2221 mutex_unlock(&inode->i_mutex);
2119 /* Let go of our reservation. */ 2222 /* Let go of our reservation. */
2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2223 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2121 return ret; 2224 return ret;
2122} 2225}
2123 2226
2124static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2227static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2125{ 2228{
2126 struct btrfs_root *root = BTRFS_I(inode)->root; 2229 struct btrfs_root *root = BTRFS_I(inode)->root;
2127 struct extent_map *em; 2230 struct extent_map *em;
@@ -2155,7 +2258,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2155 * before the position we want in case there is outstanding delalloc 2258 * before the position we want in case there is outstanding delalloc
2156 * going on here. 2259 * going on here.
2157 */ 2260 */
2158 if (origin == SEEK_HOLE && start != 0) { 2261 if (whence == SEEK_HOLE && start != 0) {
2159 if (start <= root->sectorsize) 2262 if (start <= root->sectorsize)
2160 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2263 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
2161 root->sectorsize, 0); 2264 root->sectorsize, 0);
@@ -2189,13 +2292,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2189 } 2292 }
2190 } 2293 }
2191 2294
2192 if (origin == SEEK_HOLE) { 2295 if (whence == SEEK_HOLE) {
2193 *offset = start; 2296 *offset = start;
2194 free_extent_map(em); 2297 free_extent_map(em);
2195 break; 2298 break;
2196 } 2299 }
2197 } else { 2300 } else {
2198 if (origin == SEEK_DATA) { 2301 if (whence == SEEK_DATA) {
2199 if (em->block_start == EXTENT_MAP_DELALLOC) { 2302 if (em->block_start == EXTENT_MAP_DELALLOC) {
2200 if (start >= inode->i_size) { 2303 if (start >= inode->i_size) {
2201 free_extent_map(em); 2304 free_extent_map(em);
@@ -2232,16 +2335,16 @@ out:
2232 return ret; 2335 return ret;
2233} 2336}
2234 2337
2235static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2338static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2236{ 2339{
2237 struct inode *inode = file->f_mapping->host; 2340 struct inode *inode = file->f_mapping->host;
2238 int ret; 2341 int ret;
2239 2342
2240 mutex_lock(&inode->i_mutex); 2343 mutex_lock(&inode->i_mutex);
2241 switch (origin) { 2344 switch (whence) {
2242 case SEEK_END: 2345 case SEEK_END:
2243 case SEEK_CUR: 2346 case SEEK_CUR:
2244 offset = generic_file_llseek(file, offset, origin); 2347 offset = generic_file_llseek(file, offset, whence);
2245 goto out; 2348 goto out;
2246 case SEEK_DATA: 2349 case SEEK_DATA:
2247 case SEEK_HOLE: 2350 case SEEK_HOLE:
@@ -2250,7 +2353,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
2250 return -ENXIO; 2353 return -ENXIO;
2251 } 2354 }
2252 2355
2253 ret = find_desired_extent(inode, &offset, origin); 2356 ret = find_desired_extent(inode, &offset, whence);
2254 if (ret) { 2357 if (ret) {
2255 mutex_unlock(&inode->i_mutex); 2358 mutex_unlock(&inode->i_mutex);
2256 return ret; 2359 return ret;
@@ -2293,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
2293 .compat_ioctl = btrfs_ioctl, 2396 .compat_ioctl = btrfs_ioctl,
2294#endif 2397#endif
2295}; 2398};
2399
2400void btrfs_auto_defrag_exit(void)
2401{
2402 if (btrfs_inode_defrag_cachep)
2403 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2404}
2405
2406int btrfs_auto_defrag_init(void)
2407{
2408 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2409 sizeof(struct inode_defrag), 0,
2410 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2411 NULL);
2412 if (!btrfs_inode_defrag_cachep)
2413 return -ENOMEM;
2414
2415 return 0;
2416}
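
Across the file.c changes, open-coded mask arithmetic such as `(offset + mask) & ~mask` gives way to round_up()/round_down() on the sector size. For a power-of-two alignment the two forms are identical; a self-checking sketch:

    #include <assert.h>

    /* power-of-two alignment, matching the masks on the old lines above */
    static unsigned long long rnd_down(unsigned long long x,
                                       unsigned long long a)
    {
        return x & ~(a - 1);
    }

    static unsigned long long rnd_up(unsigned long long x,
                                     unsigned long long a)
    {
        return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        assert(rnd_down(5000, 4096) == 4096);
        assert(rnd_up(5000, 4096) == 8192);
        assert(rnd_up(8192, 4096) == 8192);  /* already aligned */
        return 0;
    }

The named helpers also make the off-by-one in the fallocate hunk visible, where the old code reserved `alloc_end - alloc_start + 1` bytes against an already-aligned, exclusive-end range.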
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..59ea2e4349c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2286 unsigned long total_found = 0;
2299 int ret; 2287 int ret;
2300 2288
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2289 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2290 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2291 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2292 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2293
2306again: 2294again:
2307 found_bits = 0; 2295 found_bits = 0;
@@ -2325,23 +2313,22 @@ again:
2325 2313
2326 total_found += found_bits; 2314 total_found += found_bits;
2327 2315
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2316 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2317 cluster->max_size = found_bits * ctl->unit;
2330 2318
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2319 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2320 i = next_zero + 1;
2333 goto again; 2321 goto again;
2334 } 2322 }
2335 2323
2336 cluster->window_start = start * block_group->sectorsize + 2324 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2325 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2326 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2327 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2328 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2329
2343 trace_btrfs_setup_cluster(block_group, cluster, 2330 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2331 total_found * ctl->unit, 1);
2345 return 0; 2332 return 0;
2346} 2333}
2347 2334
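
The free-space-cache hunks replace block_group->sectorsize with ctl->unit in every piece of bitmap arithmetic, so the code works for any free-space ctl rather than assuming a block group's geometry. The conversions being parameterised look roughly like this (illustrative helpers; the kernel keeps its own inline versions in free-space-cache.c):

    /* one bit represents "unit" bytes, counted from the bitmap's base */
    static unsigned long offset_to_bit(unsigned long long base,
                                       unsigned long unit,
                                       unsigned long long offset)
    {
        return (unsigned long)((offset - base) / unit);
    }

    static unsigned long bytes_to_bits(unsigned long long bytes,
                                       unsigned long unit)
    {
        return (unsigned long)(bytes / unit);
    }

    /* and back again, as in the cluster window math above */
    static unsigned long long bits_to_bytes(unsigned long bits,
                                            unsigned long unit)
    {
        return (unsigned long long)bits * unit;
    }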
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	 * 3 items for pre-allocation
 	 */
 	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
-	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
-					  trans->bytes_reserved);
+	ret = btrfs_block_rsv_add(root, trans->block_rsv,
+				  trans->bytes_reserved,
+				  BTRFS_RESERVE_NO_FLUSH);
 	if (ret)
 		goto out;
 	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
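The _noflush variant of the reservation API is gone; callers now pass an explicit flush mode to btrfs_block_rsv_add(). Judging from the call sites in this patch, the enum introduced by the series looks roughly like this (a sketch from context; see ctree.h in the same tree):

	enum btrfs_reserve_flush_enum {
		/* caller may hold locks: don't flush delalloc or commit */
		BTRFS_RESERVE_NO_FLUSH,
		/* flush, but skip operations that could deadlock the caller */
		BTRFS_RESERVE_FLUSH_LIMIT,
		/* do whatever is needed to satisfy the reservation */
		BTRFS_RESERVE_FLUSH_ALL,
	};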
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..67ed24ae86bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, int type);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		em->compress_type = async_extent->compress_type;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
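This hunk introduces a pattern repeated throughout the file: a newly pinned extent map gets generation -1 and, once inserted, is moved onto em_tree->modified_extents, the list the tree-log code walks at fsync time to log only what changed. Condensed, the idiom is:

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	if (!ret)
		/* let fsync find this extent without a full scan */
		list_move(&em->list, &em_tree->modified_extents);
	write_unlock(&em_tree->lock);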
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
  * required to start IO on it. It may be clean and already done with
  * IO when we return.
  */
-static noinline int cow_file_range(struct inode *inode,
-				   struct page *locked_page,
-				   u64 start, u64 end, int *page_started,
-				   unsigned long *nr_written,
-				   int unlock)
+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     struct btrfs_root *root,
+				     struct page *locked_page,
+				     u64 start, u64 end, int *page_started,
+				     unsigned long *nr_written,
+				     int unlock)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
 	unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
 	int ret = 0;
 
 	BUG_ON(btrfs_is_free_space_inode(inode));
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
-		return PTR_ERR(trans);
-	}
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize, num_bytes);
 	disk_num_bytes = num_bytes;
-	ret = 0;
 
 	/* if this is a small write inside eof, kick off defrag */
 	if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	ret = 0;
 out:
-	btrfs_end_transaction(trans, root);
-
 	return ret;
+
 out_unlock:
 	extent_clear_unlock_delalloc(inode,
 				     &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
 	goto out;
 }
 
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		extent_clear_unlock_delalloc(inode,
+					     &BTRFS_I(inode)->io_tree,
+					     start, end, locked_page,
+					     EXTENT_CLEAR_UNLOCK_PAGE |
+					     EXTENT_CLEAR_UNLOCK |
+					     EXTENT_CLEAR_DELALLOC |
+					     EXTENT_CLEAR_DIRTY |
+					     EXTENT_SET_WRITEBACK |
+					     EXTENT_END_WRITEBACK);
+		return PTR_ERR(trans);
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	ret = __cow_file_range(trans, inode, root, locked_page, start, end,
+			       page_started, nr_written, unlock);
+
+	btrfs_end_transaction(trans, root);
+
+	return ret;
+}
+
 /*
  * work queue call back to started compression on a file and pages
  */
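The refactor above follows a common kernel shape: the old entry point keeps its signature but shrinks to transaction setup and teardown around a double-underscore worker that takes a caller-supplied handle, so run_delalloc_nocow() below can reuse the worker inside its own transaction. In outline (do_range/__do_range are invented names for illustration):

	static int __do_range(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end);

	static int do_range(struct inode *inode, u64 start, u64 end)
	{
		struct btrfs_root *root = BTRFS_I(inode)->root;
		struct btrfs_trans_handle *trans;
		int ret;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);	/* caller sees join failure */
		ret = __do_range(trans, inode, start, end);
		btrfs_end_transaction(trans, root);
		return ret;
	}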
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 extent_offset;
 	u64 disk_bytenr;
 	u64 num_bytes;
+	u64 disk_num_bytes;
 	int extent_type;
 	int ret, err;
 	int type;
@@ -1228,6 +1260,8 @@ next_slot:
 		extent_offset = btrfs_file_extent_offset(leaf, fi);
 		extent_end = found_key.offset +
 			btrfs_file_extent_num_bytes(leaf, fi);
+		disk_num_bytes =
+			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		if (extent_end <= start) {
 			path->slots[0]++;
 			goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
-			ret = cow_file_range(inode, locked_page, cow_start,
-					     found_key.offset - 1, page_started,
-					     nr_written, 1);
+			ret = __cow_file_range(trans, inode, root, locked_page,
+					       cow_start, found_key.offset - 1,
+					       page_started, nr_written, 1);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
 				goto error;
@@ -1298,16 +1332,21 @@ out_check:
 			em = alloc_extent_map();
 			BUG_ON(!em); /* -ENOMEM */
 			em->start = cur_offset;
-			em->orig_start = em->start;
+			em->orig_start = found_key.offset - extent_offset;
 			em->len = num_bytes;
 			em->block_len = num_bytes;
 			em->block_start = disk_bytenr;
+			em->orig_block_len = disk_num_bytes;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
-			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+			set_bit(EXTENT_FLAG_FILLING, &em->flags);
+			em->generation = -1;
 			while (1) {
 				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
+				if (!ret)
+					list_move(&em->list,
+						  &em_tree->modified_extents);
 				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
 	}
 
 	if (cow_start != (u64)-1) {
-		ret = cow_file_range(inode, locked_page, cow_start, end,
-				     page_started, nr_written, 1);
+		ret = __cow_file_range(trans, inode, root, locked_page,
+				       cow_start, end,
+				       page_started, nr_written, 1);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_mapping_tree *map_tree;
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 		return 0;
 
 	length = bio->bi_size;
-	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, READ, logical,
+	ret = btrfs_map_block(root->fs_info, READ, logical,
 			      &map_length, NULL, 0);
-	/* Will always return 0 or 1 with map_multi == NULL */
+	/* Will always return 0 with map_multi == NULL */
 	BUG_ON(ret < 0);
 	if (map_length < length + size)
 		return 1;
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	int ret;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	int ret = 0;
 	int skip_sum;
 	int metadata = 0;
+	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	if (!(rw & REQ_WRITE)) {
 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
 		if (ret)
-			return ret;
+			goto out;
 
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
-			return btrfs_submit_compressed_read(inode, bio,
-						    mirror_num, bio_flags);
+			ret = btrfs_submit_compressed_read(inode, bio,
+							   mirror_num,
+							   bio_flags);
+			goto out;
 		} else if (!skip_sum) {
 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		goto mapit;
-	} else if (!skip_sum) {
+	} else if (async && !skip_sum) {
 		/* csum items have already been cloned */
 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
-		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   bio_flags, bio_offset,
 				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
+		goto out;
+	} else if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+		if (ret)
+			goto out;
 	}
 
 mapit:
-	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+	if (ret < 0)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
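The rewritten submit path makes two behavioural changes: checksumming is punted to the worker threads only when nothing on the inode is inside a synchronous write (the new sync_writers counter; sync writers checksum inline to avoid the worker-queue latency), and every failure now completes the bio with bio_endio() instead of returning an error the caller might never propagate. Reduced to its core, the write-side decision looks like this (an illustrative reduction, not the kernel function):

	/* sketch: pick a checksumming strategy for one data write bio */
	static int submit_data_write(struct btrfs_root *root,
				     struct inode *inode, int rw,
				     struct bio *bio)
	{
		int ret;

		if (!atomic_read(&BTRFS_I(inode)->sync_writers)) {
			/* nobody is waiting: checksum in a worker thread */
			return btrfs_wq_submit_bio(root->fs_info, inode, rw,
						   bio, 0, 0, 0,
						   __btrfs_submit_bio_start,
						   __btrfs_submit_bio_done);
		}
		/* a sync writer is waiting: checksum inline, then map */
		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
		if (!ret)
			ret = btrfs_map_bio(root, rw, bio, 0, 0);
		if (ret < 0)
			bio_endio(bio, ret);	/* complete the bio on error */
		return ret;
	}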
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
-		WARN_ON(1);
+	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   cached_state, GFP_NOFS);
 }
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-		if (!ret) {
-			if (nolock)
-				trans = btrfs_join_transaction_nolock(root);
-			else
-				trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-				trans = NULL;
-				goto out;
-			}
-			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-			ret = btrfs_update_inode_fallback(trans, root, inode);
-			if (ret) /* -ENOMEM or corruption */
-				btrfs_abort_transaction(trans, root, ret);
+		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root);
+		else
+			trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
 		}
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+		ret = btrfs_update_inode_fallback(trans, root, inode);
+		if (ret) /* -ENOMEM or corruption */
+			btrfs_abort_transaction(trans, root, ret);
 		goto out;
 	}
 
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		ret = btrfs_update_inode_fallback(trans, root, inode);
-		if (ret) { /* -ENOMEM or corruption */
-			btrfs_abort_transaction(trans, root, ret);
-			goto out_unlock;
-		}
-	} else {
-		btrfs_set_inode_last_trans(trans, inode);
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode_fallback(trans, root, inode);
+	if (ret) { /* -ENOMEM or corruption */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
 	}
 	ret = 0;
 out_unlock:
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	struct inode *inode = dentry->d_inode;
 	int ret;
-	unsigned long nr = 0;
 
 	trans = __unlink_start_trans(dir, dentry);
 	if (IS_ERR(trans))
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 
 out:
-	nr = trans->blocks_used;
 	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return ret;
 }
 
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr = 0;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!err)
 		btrfs_i_size_write(inode, 0);
 out:
-	nr = trans->blocks_used;
 	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 
 	return err;
 }
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
 	if (ret)
 		goto out;
 
-	ret = -ENOMEM;
 again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -3550,7 +3595,6 @@ again:
 		goto out_unlock;
 	}
 
-	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
 		if (!len)
 			len = PAGE_CACHE_SIZE - offset;
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 		hole_em->block_start = EXTENT_MAP_HOLE;
 		hole_em->block_len = 0;
+		hole_em->orig_block_len = 0;
 		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 		hole_em->compress_type = BTRFS_COMPRESS_NONE;
 		hole_em->generation = trans->transid;
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv, *global_rsv;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
-	unsigned long nr;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
 	 * inode item when doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
+					     BTRFS_RESERVE_FLUSH_LIMIT);
 
 		/*
 		 * Try and steal from the global reserve since we will
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction_noflush(root, 1);
+		trans = btrfs_start_transaction_lflush(root, 1);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 	}
 
 	btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
 	     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(inode));
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 no_delete:
 	clear_inode(inode);
 	return;
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (S_ISREG(mode)) {
 		if (btrfs_test_opt(root, NODATASUM))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root, NODATACOW) ||
-		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
+		if (btrfs_test_opt(root, NODATACOW))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
 	}
 
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 		ret = btrfs_insert_dir_item(trans, root, name, name_len,
 					    parent_inode, &key,
 					    btrfs_inode_type(inode), index);
-		if (ret == -EEXIST)
+		if (ret == -EEXIST || ret == -EOVERFLOW)
 			goto fail_dir_item;
 		else if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	unsigned long nr = 0;
 	u64 index = 0;
 
 	if (!new_valid_dev(rdev))
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	err = btrfs_update_inode(trans, root, inode);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	/*
 	 * If the active LSM wants to access the inode during
 	 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		d_instantiate(dentry, inode);
 	}
 out_unlock:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int drop_inode = 0;
+	int drop_inode_on_err = 0;
 	int err;
-	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		err = PTR_ERR(inode);
 		goto out_unlock;
 	}
+	drop_inode_on_err = 1;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err) {
-		drop_inode = 1;
+	if (err)
+		goto out_unlock;
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
 		goto out_unlock;
-	}
 
 	/*
 	 * If the active LSM wants to access the inode during
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		d_instantiate(dentry, inode);
-	}
+		goto out_unlock;
+
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	d_instantiate(dentry, inode);
+
 out_unlock:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	if (drop_inode) {
+	if (err && drop_inode_on_err) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
 	u64 index;
-	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
 
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
 
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		btrfs_log_new_name(trans, inode, NULL, parent);
 	}
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
-	unsigned long nr = 1;
 
 	/*
 	 * 2 items for inode and ref
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	drop_on_err = 0;
 
 out_fail:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	if (drop_on_err)
 		iput(inode);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -5340,6 +5384,7 @@ again:
 		if (start + len <= found_key.offset)
 			goto not_found;
 		em->start = start;
+		em->orig_start = start;
 		em->len = found_key.offset - start;
 		goto not_found_em;
 	}
@@ -5350,6 +5395,8 @@ again:
 		em->len = extent_end - extent_start;
 		em->orig_start = extent_start -
 			btrfs_file_extent_offset(leaf, item);
+		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+								      item);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5406,7 @@ again:
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
 			em->block_start = bytenr;
-			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
-									 item);
+			em->block_len = em->orig_block_len;
 		} else {
 			bytenr += btrfs_file_extent_offset(leaf, item);
 			em->block_start = bytenr;
@@ -5390,7 +5436,8 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-		em->orig_start = EXTENT_MAP_INLINE;
+		em->orig_block_len = em->len;
+		em->orig_start = em->start;
 		if (compress_type) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
@@ -5439,11 +5486,11 @@ again:
 			   extent_map_end(em) - 1, NULL, GFP_NOFS);
 		goto insert;
 	} else {
-		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
 	}
 not_found:
 	em->start = start;
+	em->orig_start = start;
 	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
@@ -5645,38 +5692,19 @@ out:
 }
 
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
-						  struct extent_map *em,
 						  u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_key ins;
 	u64 alloc_hint;
 	int ret;
-	bool insert = false;
-
-	/*
-	 * Ok if the extent map we looked up is a hole and is for the exact
-	 * range we want, there is no reason to allocate a new one, however if
-	 * it is not right then we need to free this one and drop the cache for
-	 * our range.
-	 */
-	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
-	    em->len != len) {
-		free_extent_map(em);
-		em = NULL;
-		insert = true;
-		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
-	}
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return ERR_CAST(trans);
 
-	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
-		btrfs_add_inode_defrag(trans, inode);
-
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 		goto out;
 	}
 
-	if (!em) {
-		em = alloc_extent_map();
-		if (!em) {
-			em = ERR_PTR(-ENOMEM);
-			goto out;
-		}
-	}
-
-	em->start = start;
-	em->orig_start = em->start;
-	em->len = ins.offset;
-
-	em->block_start = ins.objectid;
-	em->block_len = ins.offset;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
-
-	/*
-	 * We need to do this because if we're using the original em we searched
-	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
-	 */
-	em->flags = 0;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
-	while (insert) {
-		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		if (ret != -EEXIST)
-			break;
-		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
-	}
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, 0);
+	if (IS_ERR(em))
+		goto out;
 
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 					   u64 len, u64 orig_start,
 					   u64 block_start, u64 block_len,
-					   int type)
+					   u64 orig_block_len, int type)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	em->block_len = block_len;
 	em->block_start = block_start;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->orig_block_len = orig_block_len;
+	em->generation = -1;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
 	if (type == BTRFS_ORDERED_PREALLOC)
-		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		set_bit(EXTENT_FLAG_FILLING, &em->flags);
 
 	do {
 		btrfs_drop_extent_cache(inode, em->start,
 				em->start + em->len - 1, 0);
 		write_lock(&em_tree->lock);
 		ret = add_extent_mapping(em_tree, em);
+		if (!ret)
+			list_move(&em->list,
+				  &em_tree->modified_extents);
 		write_unlock(&em_tree->lock);
 	} while (ret == -EEXIST);
 
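create_pinned_em() now records the full on-disk extent size (orig_block_len) separately from the length of this particular mapping, and marks the map generation -1 until it is logged. The direct-IO allocator above feeds it values straight from the allocator; its new call shape, taken from the hunk in btrfs_new_extent_direct():

	em = create_pinned_em(inode, start, ins.offset,	/* file range */
			      start,			/* orig_start */
			      ins.objectid,		/* block_start */
			      ins.offset,		/* block_len */
			      ins.offset,		/* orig_block_len */
			      0);			/* plain write, not prealloc */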
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
-			u64 orig_start = em->start;
+			u64 orig_start = em->orig_start;
+			u64 orig_block_len = em->orig_block_len;
 
 			if (type == BTRFS_ORDERED_PREALLOC) {
 				free_extent_map(em);
 				em = create_pinned_em(inode, start, len,
 						      orig_start,
-						      block_start, len, type);
+						      block_start, len,
+						      orig_block_len, type);
 				if (IS_ERR(em)) {
 					btrfs_end_transaction(trans, root);
 					goto unlock_err;
@@ -6077,7 +6085,8 @@ must_cow:
 	 * it above
 	 */
 	len = bh_result->b_size;
-	em = btrfs_new_extent_direct(inode, em, start, len);
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto unlock_err;
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
+	if (async_submit)
+		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
 	bio_get(bio);
 
 	if (!write) {
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 {
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
 	struct bio *bio;
 	struct bio *orig_bio = dip->orig_bio;
 	struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int async_submit = 0;
 
 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			bio->bi_end_io = btrfs_end_dio_bio;
 
 			map_length = orig_bio->bi_size;
-			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+			ret = btrfs_map_block(root->fs_info, READ,
+					      start_sector << 9,
 					      &map_length, NULL, 0);
 			if (ret) {
 				bio_put(bio);
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 				   btrfs_submit_direct, 0);
 }
 
+#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
+	int ret;
+
+	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
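fiemap_check_flags() rejects any flag outside the filesystem's supported set with -EBADR (see fs/ioctl.c), so a caller asking btrfs for, say, FIEMAP_FLAG_XATTR now gets an explicit error instead of the flag being silently ignored. A small userspace probe of the behaviour (a sketch, not part of the patch; the expected errno is an assumption based on the hunk):

	#include <fcntl.h>
	#include <linux/fiemap.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct fiemap fm;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		memset(&fm, 0, sizeof(fm));
		fm.fm_flags = FIEMAP_FLAG_XATTR;	/* not in BTRFS_FIEMAP_FLAGS */
		fm.fm_length = ~0ULL;			/* whole file */
		if (ioctl(fd, FS_IOC_FIEMAP, &fm) < 0)
			perror("fiemap");		/* expect EBADR after this patch */
		close(fd);
		return 0;
	}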
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
 	int ret;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
 			break;
 		}
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 
 		trans = btrfs_start_transaction(root, 2);
 		if (IS_ERR(trans)) {
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
 		if (ret && !err)
 			err = ret;
 
-		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 	}
 
 out:
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
 	ei->io_tree.track_uptodate = 1;
 	ei->io_failure_tree.track_uptodate = 1;
+	atomic_set(&ei->sync_writers, 0);
 	mutex_init(&ei->log_mutex);
 	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_path_cachep);
 	if (btrfs_free_space_cachep)
 		kmem_cache_destroy(btrfs_free_space_cachep);
+	if (btrfs_delalloc_work_cachep)
+		kmem_cache_destroy(btrfs_delalloc_work_cachep);
 }
 
 int btrfs_init_cachep(void)
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
 	if (!btrfs_free_space_cachep)
 		goto fail;
 
+	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+			sizeof(struct btrfs_delalloc_work), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+			NULL);
+	if (!btrfs_delalloc_work_cachep)
+		goto fail;
+
 	return 0;
 fail:
 	btrfs_destroy_cachep();
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+
+
+	/* check for collisions, even if the name isn't there */
+	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len);
+
+	if (ret) {
+		if (ret == -EEXIST) {
+			/* we shouldn't get
+			 * eexist without a new_inode */
+			if (!new_inode) {
+				WARN_ON(1);
+				return ret;
+			}
+		} else {
+			/* maybe -EOVERFLOW */
+			return ret;
+		}
+	}
+	ret = 0;
+
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large. Start IO on it now so
@@ -7447,6 +7496,49 @@ out_notrans:
 	return ret;
 }
 
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+	struct btrfs_delalloc_work *delalloc_work;
+
+	delalloc_work = container_of(work, struct btrfs_delalloc_work,
+				     work);
+	if (delalloc_work->wait)
+		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+	else
+		filemap_flush(delalloc_work->inode->i_mapping);
+
+	if (delalloc_work->delay_iput)
+		btrfs_add_delayed_iput(delalloc_work->inode);
+	else
+		iput(delalloc_work->inode);
+	complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						      int wait, int delay_iput)
+{
+	struct btrfs_delalloc_work *work;
+
+	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+	if (!work)
+		return NULL;
+
+	init_completion(&work->completion);
+	INIT_LIST_HEAD(&work->list);
+	work->inode = inode;
+	work->wait = wait;
+	work->delay_iput = delay_iput;
+	work->work.func = btrfs_run_delalloc_work;
+
+	return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+	wait_for_completion(&work->completion);
+	kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
+	struct btrfs_delalloc_work *work, *next;
+	struct list_head works;
+	int ret = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
+	INIT_LIST_HEAD(&works);
+
 	spin_lock(&root->fs_info->delalloc_lock);
 	while (!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 		list_del_init(&binode->delalloc_inodes);
 		spin_unlock(&root->fs_info->delalloc_lock);
 		if (inode) {
-			filemap_flush(inode->i_mapping);
-			if (delay_iput)
-				btrfs_add_delayed_iput(inode);
-			else
-				iput(inode);
+			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+			if (!work) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			list_add_tail(&work->list, &works);
+			btrfs_queue_worker(&root->fs_info->flush_workers,
+					   &work->work);
 		}
 		cond_resched();
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
-	return 0;
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+	return ret;
 }
 
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
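btrfs_start_delalloc_inodes() thus stops flushing each inode's mapping serially: it fans one btrfs_delalloc_work item per inode out to the flush_workers pool and then reaps the completions, so writeback on many inodes is kicked off in parallel. The skeleton of the pattern, using the helpers added above (locking and the igrab/iput handling elided):

	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);

	list_for_each_entry(binode, head, delalloc_inodes) {
		work = btrfs_alloc_delalloc_work(&binode->vfs_inode,
						 0 /* !wait */, delay_iput);
		if (!work)
			break;	/* the real code unwinds with -ENOMEM */
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
	}
	/* each queued item signals work->completion when it finishes */
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}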
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
 	struct extent_buffer *leaf;
-	unsigned long nr = 0;
 
 	name_len = strlen(symname) + 1;
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	if (!err)
 		d_instantiate(dentry, inode);
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		em->len = ins.offset;
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		em->generation = trans->transid;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..4b4516770f05 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
 #include "backref.h"
 #include "rcu-string.h"
 #include "send.h"
+#include "dev-replace.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 			BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
 	}
 
-	if (flags & BTRFS_INODE_NODATACOW)
+	if (flags & BTRFS_INODE_NODATACOW) {
 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+		if (S_ISREG(inode->i_mode))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+	}
 
 	btrfs_update_iflags(inode);
 }
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		ret = btrfs_commit_transaction(trans,
 					       root->fs_info->extent_root);
 	}
-	if (ret)
+	if (ret) {
+		/* cleanup_transaction has freed this for us */
+		if (trans->aborted)
+			pending_snapshot = NULL;
 		goto fail;
+	}
 
 	ret = pending_snapshot->error;
 	if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
 	if (error)
 		goto out_dput;
 
+	/*
+	 * even if this name doesn't exist, we may get hash collisions.
+	 * check for them now when we can safely fail
+	 */
+	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+					       dir->i_ino, name,
+					       namelen);
+	if (error)
+		goto out_dput;
+
 	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 
 	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
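The pre-check exists because btrfs keys directory items by a hash of the name, so two different names can collide on the same key; without the probe, the insert fails with -EOVERFLOW at a point where the transaction is much harder to unwind (the same reason btrfs_rename() gains an identical check earlier in this patch). Roughly how the colliding key is built, sketched with the crc32c-based btrfs_name_hash() from hash.h:

	struct btrfs_key key;

	key.objectid = dir_ino;				/* the directory */
	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
	key.offset = btrfs_name_hash(name, name_len);	/* crc32c of the name */
	/* a different name with the same hash -> -EOVERFLOW on insert */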
@@ -1225,7 +1243,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 
 		defrag_count += ret;
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		mutex_unlock(&inode->i_mutex);
 
 		if (newer_than) {
@@ -1293,12 +1311,13 @@ out_ra:
 	return ret;
 }
 
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
 					void __user *arg)
 {
 	u64 new_size;
 	u64 old_size;
 	u64 devid = 1;
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device = NULL;
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1332 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1333 return -EPERM;
1315 1334
1316 mutex_lock(&root->fs_info->volume_mutex); 1335 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1336 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1337 return ret;
1319 ret = -EINVAL; 1338
1320 goto out; 1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 return -EINPROGRESS;
1321 } 1343 }
1322 1344
1345 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1346 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1347 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1348 ret = PTR_ERR(vol_args);
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1362 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1363 (unsigned long long)devid);
1341 } 1364 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1365 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1366 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1367 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1368 (unsigned long long)devid);
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1394 }
1372 } 1395 }
1373 1396
1397 if (device->is_tgtdev_for_dev_replace) {
1398 ret = -EINVAL;
1399 goto out_free;
1400 }
1401
1374 old_size = device->total_bytes; 1402 old_size = device->total_bytes;
1375 1403
1376 if (mod < 0) { 1404 if (mod < 0) {
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1437 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1438 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1439 ret = btrfs_shrink_device(device, new_size);
1412 } 1440 } /* equal, nothing to do */
1413 1441
1414out_free: 1442out_free:
1415 kfree(vol_args); 1443 kfree(vol_args);
1416out: 1444out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1445 mutex_unlock(&root->fs_info->volume_mutex);
1446 mnt_drop_write_file(file);
1447 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1418 return ret; 1448 return ret;
1419} 1449}
1420 1450
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2156 if (btrfs_root_readonly(root)) 2186 if (btrfs_root_readonly(root))
2157 return -EROFS; 2187 return -EROFS;
2158 2188
2189 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2190 1)) {
2191 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2192 return -EINPROGRESS;
2193 }
2159 ret = mnt_want_write_file(file); 2194 ret = mnt_want_write_file(file);
2160 if (ret) 2195 if (ret) {
2196 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
2197 0);
2161 return ret; 2198 return ret;
2199 }
2162 2200
2163 switch (inode->i_mode & S_IFMT) { 2201 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2202 case S_IFDIR:
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2210 } 2248 }
2211out: 2249out:
2212 mnt_drop_write_file(file); 2250 mnt_drop_write_file(file);
2251 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2213 return ret; 2252 return ret;
2214} 2253}
2215 2254
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2260 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2261 return -EPERM;
2223 2262
2224 mutex_lock(&root->fs_info->volume_mutex); 2263 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2264 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2265 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2266 return -EINPROGRESS;
2228 goto out;
2229 } 2267 }
2230 2268
2269 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2270 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2271 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2272 ret = PTR_ERR(vol_args);
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2279 kfree(vol_args);
2241out: 2280out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2281 mutex_unlock(&root->fs_info->volume_mutex);
2282 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2283 return ret;
2244} 2284}
2245 2285
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2286static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2287{
2288 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2289 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2290 int ret;
2250 2291
2251 if (!capable(CAP_SYS_ADMIN)) 2292 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2293 return -EPERM;
2253 2294
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2295 ret = mnt_want_write_file(file);
2255 return -EROFS; 2296 if (ret)
2297 return ret;
2256 2298
2257 mutex_lock(&root->fs_info->volume_mutex); 2299 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2300 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2301 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2302 mnt_drop_write_file(file);
2261 goto out; 2303 return -EINPROGRESS;
2262 } 2304 }
2263 2305
2306 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2307 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2308 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2309 ret = PTR_ERR(vol_args);
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2316 kfree(vol_args);
2274out: 2317out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2318 mutex_unlock(&root->fs_info->volume_mutex);
2319 mnt_drop_write_file(file);
2320 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2276 return ret; 2321 return ret;
2277} 2322}
2278 2323
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2373 s_uuid = di_args->uuid;
2329 2374
2330 mutex_lock(&fs_devices->device_list_mutex); 2375 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2376 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2377 mutex_unlock(&fs_devices->device_list_mutex);
2333 2378
2334 if (!dev) { 2379 if (!dev) {
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2866 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2867 u64 objectid = 0;
2823 u64 dir_id; 2868 u64 dir_id;
2869 int ret;
2824 2870
2825 if (!capable(CAP_SYS_ADMIN)) 2871 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2872 return -EPERM;
2827 2873
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2874 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2875 if (ret)
2876 return ret;
2877
2878 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2879 ret = -EFAULT;
2880 goto out;
2881 }
2830 2882
2831 if (!objectid) 2883 if (!objectid)
2832 objectid = root->root_key.objectid; 2884 objectid = root->root_key.objectid;
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2888 location.offset = (u64)-1;
2837 2889
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2890 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2891 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2892 ret = PTR_ERR(new_root);
2893 goto out;
2894 }
2841 2895
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2896 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2897 ret = -ENOENT;
2898 goto out;
2899 }
2844 2900
2845 path = btrfs_alloc_path(); 2901 path = btrfs_alloc_path();
2846 if (!path) 2902 if (!path) {
2847 return -ENOMEM; 2903 ret = -ENOMEM;
2904 goto out;
2905 }
2848 path->leave_spinning = 1; 2906 path->leave_spinning = 1;
2849 2907
2850 trans = btrfs_start_transaction(root, 1); 2908 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2909 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2910 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2911 ret = PTR_ERR(trans);
2912 goto out;
2854 } 2913 }
2855 2914
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2915 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2920 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2921 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2922 "this isn't going to work\n");
2864 return -ENOENT; 2923 ret = -ENOENT;
2924 goto out;
2865 } 2925 }
2866 2926
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2927 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2931
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2932 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2933 btrfs_end_transaction(trans, root);
2874 2934out:
2875 return 0; 2935 mnt_drop_write_file(file);
2936 return ret;
2876} 2937}
2877 2938
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2939void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3097 return 0;
3037} 3098}
3038 3099
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3100static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3101 void __user *argp)
3040{ 3102{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3103 struct btrfs_trans_handle *trans;
3043 u64 transid; 3104 u64 transid;
3044 int ret; 3105 int ret;
3045 3106
3046 trans = btrfs_start_transaction(root, 0); 3107 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3108 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3109 if (PTR_ERR(trans) != -ENOENT)
3110 return PTR_ERR(trans);
3111
3112 /* No running transaction, don't bother */
3113 transid = root->fs_info->last_trans_committed;
3114 goto out;
3115 }
3049 transid = trans->transid; 3116 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3117 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3118 if (ret) {
3052 btrfs_end_transaction(trans, root); 3119 btrfs_end_transaction(trans, root);
3053 return ret; 3120 return ret;
3054 } 3121 }
3055 3122out:
3056 if (argp) 3123 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3124 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3125 return -EFAULT;
3059 return 0; 3126 return 0;
3060} 3127}
3061 3128
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3129static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3130 void __user *argp)
3063{ 3131{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3132 u64 transid;
3066 3133
3067 if (argp) { 3134 if (argp) {
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3140 return btrfs_wait_for_commit(root, transid);
3074} 3141}
3075 3142
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3143static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3144{
3078 int ret; 3145 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3146 struct btrfs_ioctl_scrub_args *sa;
3147 int ret;
3080 3148
3081 if (!capable(CAP_SYS_ADMIN)) 3149 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3150 return -EPERM;
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3153 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3154 return PTR_ERR(sa);
3087 3155
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3156 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3157 ret = mnt_want_write_file(file);
3158 if (ret)
3159 goto out;
3160 }
3161
3162 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3163 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3164 0);
3090 3165
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3166 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3167 ret = -EFAULT;
3093 3168
3169 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3170 mnt_drop_write_file(file);
3171out:
3094 kfree(sa); 3172 kfree(sa);
3095 return ret; 3173 return ret;
3096} 3174}
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3178 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3179 return -EPERM;
3102 3180
3103 return btrfs_scrub_cancel(root); 3181 return btrfs_scrub_cancel(root->fs_info);
3104} 3182}
3105 3183
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3184static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3227 return ret;
3150} 3228}
3151 3229
3230static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3231{
3232 struct btrfs_ioctl_dev_replace_args *p;
3233 int ret;
3234
3235 if (!capable(CAP_SYS_ADMIN))
3236 return -EPERM;
3237
3238 p = memdup_user(arg, sizeof(*p));
3239 if (IS_ERR(p))
3240 return PTR_ERR(p);
3241
3242 switch (p->cmd) {
3243 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3244 if (atomic_xchg(
3245 &root->fs_info->mutually_exclusive_operation_running,
3246 1)) {
3247 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3248 ret = -EINPROGRESS;
3249 } else {
3250 ret = btrfs_dev_replace_start(root, p);
3251 atomic_set(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 0);
3254 }
3255 break;
3256 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3257 btrfs_dev_replace_status(root->fs_info, p);
3258 ret = 0;
3259 break;
3260 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3261 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3262 break;
3263 default:
3264 ret = -EINVAL;
3265 break;
3266 }
3267
3268 if (copy_to_user(arg, p, sizeof(*p)))
3269 ret = -EFAULT;
3270
3271 kfree(p);
3272 return ret;
3273}
3274
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3275static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3276{
3154 int ret = 0; 3277 int ret = 0;
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3315 struct btrfs_ioctl_balance_args *bargs; 3438 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3439 struct btrfs_balance_control *bctl;
3317 int ret; 3440 int ret;
3441 int need_to_clear_lock = 0;
3318 3442
3319 if (!capable(CAP_SYS_ADMIN)) 3443 if (!capable(CAP_SYS_ADMIN))
3320 return -EPERM; 3444 return -EPERM;
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3350 bargs = NULL; 3474 bargs = NULL;
3351 } 3475 }
3352 3476
3353 if (fs_info->balance_ctl) { 3477 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
3478 1)) {
3479 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3354 ret = -EINPROGRESS; 3480 ret = -EINPROGRESS;
3355 goto out_bargs; 3481 goto out_bargs;
3356 } 3482 }
3483 need_to_clear_lock = 1;
3357 3484
3358 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3485 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3359 if (!bctl) { 3486 if (!bctl) {
@@ -3387,6 +3514,9 @@ do_balance:
3387out_bargs: 3514out_bargs:
3388 kfree(bargs); 3515 kfree(bargs);
3389out: 3516out:
3517 if (need_to_clear_lock)
3518 atomic_set(&root->fs_info->mutually_exclusive_operation_running,
3519 0);
3390 mutex_unlock(&fs_info->balance_mutex); 3520 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3521 mutex_unlock(&fs_info->volume_mutex);
3392 mnt_drop_write_file(file); 3522 mnt_drop_write_file(file);
@@ -3441,8 +3571,9 @@ out:
3441 return ret; 3571 return ret;
3442} 3572}
3443 3573
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3574static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3575{
3576 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3577 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3578 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3579 int ret;
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3582 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3583 return -EPERM;
3453 3584
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3585 ret = mnt_want_write_file(file);
3455 return -EROFS; 3586 if (ret)
3587 return ret;
3456 3588
3457 sa = memdup_user(arg, sizeof(*sa)); 3589 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3590 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3591 ret = PTR_ERR(sa);
3592 goto drop_write;
3593 }
3460 3594
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3595 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3596 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3623 if (err && !ret)
3490 ret = err; 3624 ret = err;
3491 } 3625 }
3492
3493out: 3626out:
3494 kfree(sa); 3627 kfree(sa);
3628drop_write:
3629 mnt_drop_write_file(file);
3495 return ret; 3630 return ret;
3496} 3631}
3497 3632
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3633static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3634{
3635 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3636 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3637 struct btrfs_trans_handle *trans;
3502 int ret; 3638 int ret;
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3641 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3642 return -EPERM;
3507 3643
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3644 ret = mnt_want_write_file(file);
3509 return -EROFS; 3645 if (ret)
3646 return ret;
3510 3647
3511 sa = memdup_user(arg, sizeof(*sa)); 3648 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3649 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3650 ret = PTR_ERR(sa);
3651 goto drop_write;
3652 }
3514 3653
3515 trans = btrfs_join_transaction(root); 3654 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3655 if (IS_ERR(trans)) {
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3672
3534out: 3673out:
3535 kfree(sa); 3674 kfree(sa);
3675drop_write:
3676 mnt_drop_write_file(file);
3536 return ret; 3677 return ret;
3537} 3678}
3538 3679
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3680static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3681{
3682 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3683 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3684 struct btrfs_trans_handle *trans;
3543 int ret; 3685 int ret;
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3688 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3689 return -EPERM;
3548 3690
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3691 ret = mnt_want_write_file(file);
3550 return -EROFS; 3692 if (ret)
3693 return ret;
3551 3694
3552 sa = memdup_user(arg, sizeof(*sa)); 3695 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3696 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3697 ret = PTR_ERR(sa);
3698 goto drop_write;
3699 }
3555 3700
3556 trans = btrfs_join_transaction(root); 3701 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3702 if (IS_ERR(trans)) {
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3718
3574out: 3719out:
3575 kfree(sa); 3720 kfree(sa);
3721drop_write:
3722 mnt_drop_write_file(file);
3576 return ret; 3723 return ret;
3577} 3724}
3578 3725
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3726static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3727{
3728 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3729 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3730 struct btrfs_trans_handle *trans;
3583 int ret; 3731 int ret;
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3735 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3736 return -EPERM;
3589 3737
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3738 ret = mnt_want_write_file(file);
3591 return -EROFS; 3739 if (ret)
3740 return ret;
3592 3741
3593 sa = memdup_user(arg, sizeof(*sa)); 3742 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3743 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3744 ret = PTR_ERR(sa);
3745 goto drop_write;
3746 }
3596 3747
3597 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3749 if (IS_ERR(trans)) {
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3766
3616out: 3767out:
3617 kfree(sa); 3768 kfree(sa);
3769drop_write:
3770 mnt_drop_write_file(file);
3618 return ret; 3771 return ret;
3619} 3772}
3620 3773
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3888 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3889 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3890 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3891 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3892 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3893 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3894 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3895 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3896 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3897 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3898 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3921 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3922 return 0;
3770 case BTRFS_IOC_START_SYNC: 3923 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3924 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3925 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3926 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3927 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3928 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3929 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3930 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3931 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 3943 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 3944 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 3945 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 3946 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 3947 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 3948 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 3949 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 3950 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 3951 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 3952 return btrfs_ioctl_qgroup_limit(file, argp);
3953 case BTRFS_IOC_DEV_REPLACE:
3954 return btrfs_ioctl_dev_replace(root, argp);
3800 } 3955 }
3801 3956
3802 return -ENOTTY; 3957 return -ENOTTY;
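
The recurring change across these ioctl handlers replaces the old "is a balance running?" check with a single atomic flag, fs_info->mutually_exclusive_operation_running, claimed via atomic_xchg(): the exchange returns the previous value, so a non-zero result means another dev add/delete/balance/replace/resize already holds the slot and the caller backs off with -EINPROGRESS instead of blocking. Below is a minimal userspace sketch of the same try-lock idiom, using C11 atomics in place of the kernel's atomic_t; the function names are illustrative, not kernel API.

/*
 * Sketch of the atomic_xchg() try-lock pattern used by the patched
 * btrfs ioctls. atomic_exchange() returns the prior value, so seeing
 * 1 means another exclusive operation is running and we must not
 * proceed; 0 means we just claimed the flag.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int mutually_exclusive_operation_running;

static int try_begin_exclusive_op(const char *name)
{
	if (atomic_exchange(&mutually_exclusive_operation_running, 1)) {
		fprintf(stderr, "%s: operation already in progress\n", name);
		return -EINPROGRESS;
	}
	return 0;
}

static void end_exclusive_op(void)
{
	/* mirrors the atomic_set(..., 0) in the out: paths above */
	atomic_store(&mutually_exclusive_operation_running, 0);
}

int main(void)
{
	if (try_begin_exclusive_op("resize") == 0) {
		/* ... the exclusive work (resize/balance/replace) runs here ... */
		end_exclusive_op();
	}
	/* a second caller inside the window above would get -EINPROGRESS */
	return 0;
}

Note the ordering subtlety visible in the hunks: btrfs_ioctl_rm_dev() takes mnt_want_write_file() before claiming the flag and must drop it again on the -EINPROGRESS path, while btrfs_ioctl_defrag() claims the flag first and must clear it if mnt_want_write_file() fails.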
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
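
For illustration, a hedged userspace sketch of driving the new BTRFS_IOC_DEV_REPLACE ioctl to query replace progress. The structures and command values are copied from the hunk above; BTRFS_IOCTL_MAGIC (0x94) comes from the surrounding header, which this diff does not show, so treat that as an assumption if building against a different tree.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94		/* assumed from the full ioctl header */
#define BTRFS_DEVICE_PATH_NAME_MAX 1024
#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1

/* Layouts as added by this patch. */
struct btrfs_ioctl_dev_replace_start_params {
	__u64 srcdevid;
	__u64 cont_reading_from_srcdev_mode;
	__u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];
	__u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];
};

struct btrfs_ioctl_dev_replace_status_params {
	__u64 replace_state;
	__u64 progress_1000;
	__u64 time_started;
	__u64 time_stopped;
	__u64 num_write_errors;
	__u64 num_uncorrectable_read_errors;
};

struct btrfs_ioctl_dev_replace_args {
	__u64 cmd;	/* in */
	__u64 result;	/* out */
	union {
		struct btrfs_ioctl_dev_replace_start_params start;
		struct btrfs_ioctl_dev_replace_status_params status;
	};	/* in/out */
	__u64 spare[64];
};

#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
				    struct btrfs_ioctl_dev_replace_args)

int main(int argc, char **argv)
{
	struct btrfs_ioctl_dev_replace_args args;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <btrfs-mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
	if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0) {
		perror("BTRFS_IOC_DEV_REPLACE");
		close(fd);
		return 1;
	}
	printf("replace_state=%llu progress=%llu/1000\n",
	       (unsigned long long)args.status.replace_state,
	       (unsigned long long)args.status.progress_1000);
	close(fd);
	return 0;
}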
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
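
The two new helpers compute num * factor / 10 (coarse, factor in tenths) and num * factor / 100 (fine, factor in percent), using do_div(), the kernel's in-place 64-bit division helper for 32-bit targets. A direct userspace translation, with plain 64-bit division standing in for do_div():

#include <stdint.h>
#include <stdio.h>

/* num scaled by factor tenths, e.g. factor 9 -> 90% */
static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

/* num scaled by factor percent, e.g. factor 95 -> 95% */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	/* e.g. thresholds on a 10 GiB chunk, as allocator-style callers use them */
	printf("%llu\n", (unsigned long long)div_factor(10737418240ULL, 9));
	printf("%llu\n", (unsigned long long)div_factor_fine(10737418240ULL, 95));
	return 0;
}

The factor == 10 / factor == 100 early returns skip the multiply-divide entirely in the common "100%" case and also avoid overflowing num * factor when num is already near the top of the u64 range.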
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
511 536
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
571 618
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 978 if (last_mod < root->fs_info->last_trans_committed)
935 return; 979 return;
936 980
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 981 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 983 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
959 NULL); 994 NULL);
960 if (!btrfs_ordered_extent_cache) 995 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 996 return -ENOMEM;
997
962 return 0; 998 return 0;
963} 999}
964 1000
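
btrfs_wait_ordered_extents() previously flushed each ordered extent synchronously inside the loop; the rewrite above fans the flushes out to the flush_workers pool, collects the queued entries on a local works list, and only then waits on each one's completion, so the per-extent waits overlap instead of serializing. A userspace sketch of that fan-out/join shape, with pthreads standing in for btrfs_queue_worker() and a thread join for wait_for_completion() (names illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR_EXTENTS 4

struct ordered_extent {
	int id;
	pthread_t worker;	/* stands in for flush_work + completion */
};

static void *flush_one(void *arg)
{
	struct ordered_extent *oe = arg;
	/* the btrfs_start_ordered_extent() equivalent would run here */
	printf("flushed ordered extent %d\n", oe->id);
	return NULL;
}

int main(void)
{
	struct ordered_extent extents[NR_EXTENTS];
	int i;

	/* first pass: start every flush without waiting (fan-out) */
	for (i = 0; i < NR_EXTENTS; i++) {
		extents[i].id = i;
		pthread_create(&extents[i].worker, NULL, flush_one,
			       &extents[i]);
	}
	/* second pass: wait for each, then release its resources (join) */
	for (i = 0; i < NR_EXTENTS; i++)
		pthread_join(extents[i].worker, NULL);
	return 0;
}

The same split explains why the iput()/btrfs_add_delayed_iput() calls move out of the scan loop: the inode reference must stay alive until the queued flush has completed, so it is dropped in the second pass, after wait_for_completion().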
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78 78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82 82
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 STRATO. All rights reserved.
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 
@@ -42,10 +43,23 @@
  */
 
 struct scrub_block;
-struct scrub_dev;
+struct scrub_ctx;
 
-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
-#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 
 struct scrub_page {
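
With 4 KiB pages the new constants work out to 32 x 4 KiB = 128 KiB per bio and 64 x 128 KiB = 8 MiB of outstanding I/O per device, matching the comments in the hunk. A hypothetical compile-time restatement of that arithmetic (not in the patch):

	/* Assumes PAGE_SIZE == 4096; documents the sizing arithmetic only. */
	#define SCRUB_BYTES_PER_RD_BIO	(SCRUB_PAGES_PER_RD_BIO * 4096)			/* 128 KiB */
	#define SCRUB_BYTES_IN_FLIGHT	(SCRUB_BIOS_PER_SCTX * SCRUB_BYTES_PER_RD_BIO)	/* 8 MiB */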
@@ -56,6 +70,8 @@ struct scrub_page {
 	u64			generation;
 	u64			logical;
 	u64			physical;
+	u64			physical_for_dev_replace;
+	atomic_t		ref_count;
 	struct {
 		unsigned int	mirror_num:8;
 		unsigned int	have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
 
 struct scrub_bio {
 	int			index;
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
 	struct bio		*bio;
 	int			err;
 	u64			logical;
 	u64			physical;
-	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
 	int			page_count;
 	int			next_free;
 	struct btrfs_work	work;
 };
 
 struct scrub_block {
-	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 	int			page_count;
 	atomic_t		outstanding_pages;
 	atomic_t		ref_count; /* free mem on transition to zero */
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
 	struct {
 		unsigned int	header_error:1;
 		unsigned int	checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
 	};
 };
 
-struct scrub_dev {
-	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
-	struct btrfs_device	*dev;
+struct scrub_wr_ctx {
+	struct scrub_bio	*wr_curr_bio;
+	struct btrfs_device	*tgtdev;
+	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t		flush_all_writes;
+	struct mutex		wr_lock;
+};
+
+struct scrub_ctx {
+	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
+	struct btrfs_root	*dev_root;
 	int			first_free;
 	int			curr;
-	atomic_t		in_flight;
-	atomic_t		fixup_cnt;
+	atomic_t		bios_in_flight;
+	atomic_t		workers_pending;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
-	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+	int			pages_per_rd_bio;
 	u32			sectorsize;
 	u32			nodesize;
 	u32			leafsize;
+
+	int			is_dev_replace;
+	struct scrub_wr_ctx	wr_ctx;
+
 	/*
 	 * statistics
 	 */
@@ -116,13 +149,23 @@ struct scrub_dev {
 };
 
 struct scrub_fixup_nodatasum {
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
 	u64			logical;
 	struct btrfs_root	*root;
 	struct btrfs_work	work;
 	int			mirror_num;
 };
 
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx	*sctx;
+	u64			logical;
+	u64			len;
+	int			mirror_num;
+	u64			physical_for_dev_replace;
+	struct btrfs_work	work;
+};
+
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
 };
 
 
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-				     struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
-				     struct scrub_block *sblock);
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				     struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			       struct scrub_block *sblock, int is_metadata,
 			       int have_csum, u8 *csum, u64 generation,
 			       u16 csum_size);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
 static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
-				 struct scrub_page *spage);
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
-		       u64 physical, u64 flags, u64 gen, int mirror_num,
-		       u8 *csum, int force);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+	atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+	atomic_dec(&sctx->bios_in_flight);
+	wake_up(&sctx->list_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	/*
+	 * increment scrubs_running to prevent cancel requests from
+	 * completing as long as a worker is running. we must also
+	 * increment scrubs_paused to prevent deadlocking on pause
+	 * requests used for transactions commits (as the worker uses a
+	 * transaction context). it is safe to regard the worker
+	 * as paused for all matters practical. effectively, we only
+	 * avoid cancellation requests from completing.
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrubs_running);
+	atomic_inc(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_inc(&sctx->workers_pending);
+}
 
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
-static void scrub_free_csums(struct scrub_dev *sdev)
+	/*
+	 * see scrub_pending_trans_workers_inc() why we're pretending
+	 * to be paused in the scrub counters
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sctx->workers_pending);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
 {
-	while (!list_empty(&sdev->csum_list)) {
+	while (!list_empty(&sctx->csum_list)) {
 		struct btrfs_ordered_sum *sum;
-		sum = list_first_entry(&sdev->csum_list,
+		sum = list_first_entry(&sctx->csum_list,
 				       struct btrfs_ordered_sum, list);
 		list_del(&sum->list);
 		kfree(sum);
 	}
 }
 
-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
 	int i;
 
-	if (!sdev)
+	if (!sctx)
 		return;
 
+	scrub_free_wr_ctx(&sctx->wr_ctx);
+
 	/* this can happen when scrub is cancelled */
-	if (sdev->curr != -1) {
-		struct scrub_bio *sbio = sdev->bios[sdev->curr];
+	if (sctx->curr != -1) {
+		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
 		for (i = 0; i < sbio->page_count; i++) {
-			BUG_ON(!sbio->pagev[i]);
-			BUG_ON(!sbio->pagev[i]->page);
+			WARN_ON(!sbio->pagev[i]->page);
 			scrub_block_put(sbio->pagev[i]->sblock);
 		}
 		bio_put(sbio->bio);
 	}
 
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-		struct scrub_bio *sbio = sdev->bios[i];
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+		struct scrub_bio *sbio = sctx->bios[i];
 
 		if (!sbio)
 			break;
 		kfree(sbio);
 	}
 
-	scrub_free_csums(sdev);
-	kfree(sdev);
+	scrub_free_csums(sctx);
+	kfree(sctx);
 }
 
 static noinline_for_stack
-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	int i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-	int pages_per_bio;
+	int pages_per_rd_bio;
+	int ret;
 
-	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
-			      bio_get_nr_vecs(dev->bdev));
-	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
-	if (!sdev)
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+	if (!sctx)
 		goto nomem;
-	sdev->dev = dev;
-	sdev->pages_per_bio = pages_per_bio;
-	sdev->curr = -1;
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+	sctx->is_dev_replace = is_dev_replace;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
+	sctx->curr = -1;
+	sctx->dev_root = dev->dev_root;
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 		if (!sbio)
 			goto nomem;
-		sdev->bios[i] = sbio;
+		sctx->bios[i] = sbio;
 
 		sbio->index = i;
-		sbio->sdev = sdev;
+		sbio->sctx = sctx;
 		sbio->page_count = 0;
 		sbio->work.func = scrub_bio_end_io_worker;
 
-		if (i != SCRUB_BIOS_PER_DEV-1)
-			sdev->bios[i]->next_free = i + 1;
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
+			sctx->bios[i]->next_free = i + 1;
 		else
-			sdev->bios[i]->next_free = -1;
+			sctx->bios[i]->next_free = -1;
 	}
-	sdev->first_free = 0;
-	sdev->nodesize = dev->dev_root->nodesize;
-	sdev->leafsize = dev->dev_root->leafsize;
-	sdev->sectorsize = dev->dev_root->sectorsize;
-	atomic_set(&sdev->in_flight, 0);
-	atomic_set(&sdev->fixup_cnt, 0);
-	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	INIT_LIST_HEAD(&sdev->csum_list);
+	sctx->first_free = 0;
+	sctx->nodesize = dev->dev_root->nodesize;
+	sctx->leafsize = dev->dev_root->leafsize;
+	sctx->sectorsize = dev->dev_root->sectorsize;
+	atomic_set(&sctx->bios_in_flight, 0);
+	atomic_set(&sctx->workers_pending, 0);
+	atomic_set(&sctx->cancel_req, 0);
+	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	INIT_LIST_HEAD(&sctx->csum_list);
 
-	spin_lock_init(&sdev->list_lock);
-	spin_lock_init(&sdev->stat_lock);
-	init_waitqueue_head(&sdev->list_wait);
-	return sdev;
+	spin_lock_init(&sctx->list_lock);
+	spin_lock_init(&sctx->stat_lock);
+	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
+	return sctx;
 
 nomem:
-	scrub_free_dev(sdev);
+	scrub_free_ctx(sctx);
 	return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
 {
 	u64 isize;
 	u32 nlink;
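
The bios_in_flight and workers_pending counters introduced above pair every increment with a decrement-plus-wake_up on list_wait, so a waiter only needs wait_event(). A minimal quiescence helper in that style (sketch; the real scrub code open-codes these waits at the end of a scrub pass):

	static void scrub_wait_idle(struct scrub_ctx *sctx)
	{
		/* wait until no read/write bios are in flight ... */
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->bios_in_flight) == 0);
		/* ... and no transaction-committing workers remain */
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->workers_pending) == 0);
	}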
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	int i;
 	struct extent_buffer *eb;
 	struct btrfs_inode_item *inode_item;
-	struct scrub_warning *swarn = ctx;
+	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
 
 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
-	struct btrfs_device *dev = sblock->sdev->dev;
-	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	struct btrfs_key found_key;
 	struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	const int bufsize = 4096;
 	int ret;
 
+	WARN_ON(sblock->page_count < 1);
+	dev = sblock->pagev[0]->dev;
+	fs_info = sblock->sctx->dev_root->fs_info;
+
 	path = btrfs_alloc_path();
 
 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-	BUG_ON(sblock->page_count < 1);
-	swarn.sector = (sblock->pagev[0].physical) >> 9;
-	swarn.logical = sblock->pagev[0].logical;
+	swarn.sector = (sblock->pagev[0]->physical) >> 9;
+	swarn.logical = sblock->pagev[0]->logical;
 	swarn.errstr = errstr;
-	swarn.dev = dev;
+	swarn.dev = NULL;
 	swarn.msg_bufsize = bufsize;
 	swarn.scratch_bufsize = bufsize;
 
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 		} while (ret != 1);
 	} else {
 		swarn.path = path;
+		swarn.dev = dev;
 		iterate_extent_inodes(fs_info, found_key.objectid,
 					extent_item_pos, 1,
 					scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
 	kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
-	struct scrub_fixup_nodatasum *fixup = ctx;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 	}
 
 	if (PageUptodate(page)) {
-		struct btrfs_mapping_tree *map_tree;
+		struct btrfs_fs_info *fs_info;
 		if (PageDirty(page)) {
 			/*
 			 * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 			ret = -EIO;
 			goto out;
 		}
-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+		fs_info = BTRFS_I(inode)->root->fs_info;
+		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 					fixup->logical, page,
 					fixup->mirror_num);
 		unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 {
 	int ret;
 	struct scrub_fixup_nodatasum *fixup;
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	int uncorrectable = 0;
 
 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
-	sdev = fixup->sdev;
+	sctx = fixup->sctx;
 	fs_info = fixup->root->fs_info;
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.malloc_errors;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.malloc_errors;
+		spin_unlock(&sctx->stat_lock);
 		uncorrectable = 1;
 		goto out;
 	}
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 	}
 	WARN_ON(ret != 1);
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.corrected_errors;
-	spin_unlock(&sdev->stat_lock);
+	spin_lock(&sctx->stat_lock);
+	++sctx->stat.corrected_errors;
+	spin_unlock(&sctx->stat_lock);
 
 out:
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, fixup->root);
 	if (uncorrectable) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.uncorrectable_errors;
-		spin_unlock(&sdev->stat_lock);
-
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.uncorrectable_errors;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			(unsigned long long)fixup->logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(fixup->dev->name));
 	}
 
 	btrfs_free_path(path);
 	kfree(fixup);
 
-	/* see caller why we're pretending to be paused in the scrub counters */
-	mutex_lock(&fs_info->scrub_lock);
-	atomic_dec(&fs_info->scrubs_running);
-	atomic_dec(&fs_info->scrubs_paused);
-	mutex_unlock(&fs_info->scrub_lock);
-	atomic_dec(&sdev->fixup_cnt);
-	wake_up(&fs_info->scrub_pause_wait);
-	wake_up(&sdev->list_wait);
+	scrub_pending_trans_workers_dec(sctx);
 }
 
 /*
@@ -614,7 +764,8 @@ out:
  */
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
-	struct scrub_dev *sdev = sblock_to_check->sdev;
+	struct scrub_ctx *sctx = sblock_to_check->sctx;
+	struct btrfs_device *dev;
 	struct btrfs_fs_info *fs_info;
 	u64 length;
 	u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 					  DEFAULT_RATELIMIT_BURST);
 
 	BUG_ON(sblock_to_check->page_count < 1);
-	fs_info = sdev->dev->dev_root->fs_info;
+	fs_info = sctx->dev_root->fs_info;
+	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * They will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		return 0;
+	}
 	length = sblock_to_check->page_count * PAGE_SIZE;
-	logical = sblock_to_check->pagev[0].logical;
-	generation = sblock_to_check->pagev[0].generation;
-	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
-	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
-	is_metadata = !(sblock_to_check->pagev[0].flags &
+	logical = sblock_to_check->pagev[0]->logical;
+	generation = sblock_to_check->pagev[0]->generation;
+	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+	is_metadata = !(sblock_to_check->pagev[0]->flags &
 			BTRFS_EXTENT_FLAG_DATA);
-	have_csum = sblock_to_check->pagev[0].have_csum;
-	csum = sblock_to_check->pagev[0].csum;
+	have_csum = sblock_to_check->pagev[0]->have_csum;
+	csum = sblock_to_check->pagev[0]->csum;
+	dev = sblock_to_check->pagev[0]->dev;
+
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
 
 	/*
 	 * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				     sizeof(*sblocks_for_recheck),
 				     GFP_NOFS);
 	if (!sblocks_for_recheck) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.malloc_errors++;
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 
 	/* setup the context, map the logical blocks and alloc the pages */
-	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 					logical, sblocks_for_recheck);
 	if (ret) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
 
 	/* build and submit the bios for the failed mirror, check checksums */
-	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-				  csum, generation, sdev->csum_size);
-	if (ret) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
-		goto out;
-	}
+	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+			    csum, generation, sctx->csum_size);
 
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		 * different bio (usually one of the two latter cases is
 		 * the cause)
 		 */
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.unverified_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.unverified_errors++;
+		spin_unlock(&sctx->stat_lock);
 
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 
 	if (!sblock_bad->no_io_error_seen) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("i/o error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.csum_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.csum_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
+		btrfs_dev_stat_inc_and_print(dev,
 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	} else if (sblock_bad->header_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.verify_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.verify_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
 		if (sblock_bad->generation_error)
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 					BTRFS_DEV_STAT_GENERATION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 					BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sdev->readonly)
+	if (sctx->readonly && !sctx->is_dev_replace)
 		goto did_not_correct_error;
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
+
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 		if (!fixup_nodatasum)
 			goto did_not_correct_error;
-		fixup_nodatasum->sdev = sdev;
+		fixup_nodatasum->sctx = sctx;
+		fixup_nodatasum->dev = dev;
 		fixup_nodatasum->logical = logical;
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
-		/*
-		 * increment scrubs_running to prevent cancel requests from
-		 * completing as long as a fixup worker is running. we must also
-		 * increment scrubs_paused to prevent deadlocking on pause
-		 * requests used for transactions commits (as the worker uses a
-		 * transaction context). it is safe to regard the fixup worker
-		 * as paused for all matters practical. effectively, we only
-		 * avoid cancellation requests from completing.
-		 */
-		mutex_lock(&fs_info->scrub_lock);
-		atomic_inc(&fs_info->scrubs_running);
-		atomic_inc(&fs_info->scrubs_paused);
-		mutex_unlock(&fs_info->scrub_lock);
-		atomic_inc(&sdev->fixup_cnt);
+		scrub_pending_trans_workers_inc(sctx);
 		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 		btrfs_queue_worker(&fs_info->scrub_workers,
 				   &fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/*
 	 * now build and submit the bios for the other mirrors, check
-	 * checksums
-	 */
-	for (mirror_index = 0;
-	     mirror_index < BTRFS_MAX_MIRRORS &&
-	     sblocks_for_recheck[mirror_index].page_count > 0;
-	     mirror_index++) {
-		if (mirror_index == failed_mirror_index)
-			continue;
-
-		/* build and submit the bios, check checksums */
-		ret = scrub_recheck_block(fs_info,
-					  sblocks_for_recheck + mirror_index,
-					  is_metadata, have_csum, csum,
-					  generation, sdev->csum_size);
-		if (ret)
-			goto did_not_correct_error;
-	}
-
-	/*
-	 * first try to pick the mirror which is completely without I/O
+	 * checksums.
+	 * First try to pick the mirror which is completely without I/O
 	 * errors and also does not have a checksum error.
 	 * If one is found, and if a checksum is present, the full block
 	 * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	     mirror_index < BTRFS_MAX_MIRRORS &&
 	     sblocks_for_recheck[mirror_index].page_count > 0;
 	     mirror_index++) {
-		struct scrub_block *sblock_other = sblocks_for_recheck +
-						   mirror_index;
+		struct scrub_block *sblock_other;
+
+		if (mirror_index == failed_mirror_index)
+			continue;
+		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* build and submit the bios, check checksums */
+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
+				    have_csum, csum, generation,
+				    sctx->csum_size);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
-			int force_write = is_metadata || have_csum;
-
-			ret = scrub_repair_block_from_good_copy(sblock_bad,
-								sblock_other,
-								force_write);
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 
 	/*
-	 * in case of I/O errors in the area that is supposed to be
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
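
The dev-replace branch above tries each mirror in turn for every page and falls back to writing the (zero-filled) bad page when no mirror has a clean copy. Condensed C restatement (sketch only; "nr_mirrors" stands in for the page_count > 0 termination test used in the patch):

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int copied = 0;
		int m;

		for (m = 0; !copied && m < nr_mirrors; m++) {
			struct scrub_block *other = sblocks_for_recheck + m;

			if (!other->pagev[page_num]->io_error &&
			    scrub_write_page_to_dev_replace(other, page_num) == 0)
				copied = 1;	/* this page is done */
		}
		if (!copied)	/* no clean mirror: zeros get written */
			scrub_write_page_to_dev_replace(sblock_bad, page_num);
	}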
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	success = 1;
 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
-		struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
 
 		if (!page_bad->io_error)
 			continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		     mirror_index++) {
 			struct scrub_block *sblock_other = sblocks_for_recheck +
 							   mirror_index;
-			struct scrub_page *page_other = sblock_other->pagev +
-							page_num;
+			struct scrub_page *page_other = sblock_other->pagev[
+							page_num];
 
 			if (!page_other->io_error) {
 				ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 			 * is verified, but most likely the data comes out
 			 * of the page cache.
 			 */
-			ret = scrub_recheck_block(fs_info, sblock_bad,
+			scrub_recheck_block(fs_info, sblock_bad,
 					    is_metadata, have_csum, csum,
-					    generation, sdev->csum_size);
-			if (!ret && !sblock_bad->header_error &&
+					    generation, sctx->csum_size);
+			if (!sblock_bad->header_error &&
 			    !sblock_bad->checksum_error &&
 			    sblock_bad->no_io_error_seen)
 				goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				goto did_not_correct_error;
 		} else {
 corrected_error:
 			spin_lock(&sctx->stat_lock);
 			sctx->stat.corrected_errors++;
 			spin_unlock(&sctx->stat_lock);
-			spin_lock(&sdev->stat_lock);
-			sdev->stat.corrected_errors++;
-			spin_unlock(&sdev->stat_lock);
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.corrected_errors++;
+			spin_unlock(&sctx->stat_lock);
 			printk_ratelimited_in_rcu(KERN_ERR
 				"btrfs: fixed up error at logical %llu on dev %s\n",
 				(unsigned long long)logical,
-				rcu_str_deref(sdev->dev->name));
+				rcu_str_deref(dev->name));
 		}
 	} else {
 did_not_correct_error:
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
 			(unsigned long long)logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(dev->name));
 	}
 
 out:
@@ -966,11 +1166,11 @@ out:
 						  mirror_index;
 			int page_index;
 
-			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
-			     page_index++)
-				if (sblock->pagev[page_index].page)
-					__free_page(
-						sblock->pagev[page_index].page);
+			for (page_index = 0; page_index < sblock->page_count;
+			     page_index++) {
+				sblock->pagev[page_index]->sblock = NULL;
+				scrub_page_put(sblock->pagev[page_index]);
+			}
 		}
 		kfree(sblocks_for_recheck);
 	}
@@ -978,8 +1178,9 @@ out:
 	return 0;
 }
 
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-				     struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 	int ret;
 
 	/*
-	 * note: the three members sdev, ref_count and outstanding_pages
+	 * note: the two members ref_count and outstanding_pages
 	 * are not used (and not set) in the blocks that are used for
 	 * the recheck procedure
 	 */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 		 * with a length of PAGE_SIZE, each returned stripe
 		 * represents one mirror
 		 */
-		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
-				      &bbio, 0);
+		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+				      &mapped_length, &bbio, 0);
 		if (ret || !bbio || mapped_length < sublen) {
 			kfree(bbio);
 			return -EIO;
 		}
 
-		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 		     mirror_index++) {
 			struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 				continue;
 
 			sblock = sblocks_for_recheck + mirror_index;
-			page = sblock->pagev + page_index;
+			sblock->sctx = sctx;
+			page = kzalloc(sizeof(*page), GFP_NOFS);
+			if (!page) {
+leave_nomem:
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.malloc_errors++;
+				spin_unlock(&sctx->stat_lock);
+				kfree(bbio);
+				return -ENOMEM;
+			}
+			scrub_page_get(page);
+			sblock->pagev[page_index] = page;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
 			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
-			page->page = alloc_page(GFP_NOFS);
-			if (!page->page) {
-				spin_lock(&sdev->stat_lock);
-				sdev->stat.malloc_errors++;
-				spin_unlock(&sdev->stat_lock);
-				kfree(bbio);
-				return -ENOMEM;
-			}
 			sblock->page_count++;
+			page->page = alloc_page(GFP_NOFS);
+			if (!page->page)
+				goto leave_nomem;
 		}
 		kfree(bbio);
 		length -= sublen;
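
scrub_page objects are now allocated individually and reference counted (scrub_page_get() above, scrub_page_put() on teardown). A get/put pair consistent with the usage in this hunk would look like the following (sketch; the patch defines these helpers elsewhere in the file):

	static void scrub_page_get(struct scrub_page *spage)
	{
		atomic_inc(&spage->ref_count);
	}

	static void scrub_page_put(struct scrub_page *spage)
	{
		if (atomic_dec_and_test(&spage->ref_count)) {
			if (spage->page)
				__free_page(spage->page);
			kfree(spage);
		}
	}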
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
  * to take those pages that are not errored from all the mirrors so that
  * the pages that are errored in the just handled mirror can be repaired.
  */
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			       struct scrub_block *sblock, int is_metadata,
 			       int have_csum, u8 *csum, u64 generation,
 			       u16 csum_size)
 {
 	int page_num;
 
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
-		int ret;
-		struct scrub_page *page = sblock->pagev + page_num;
+		struct scrub_page *page = sblock->pagev[page_num];
 		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 
-		BUG_ON(!page->page);
+		WARN_ON(!page->page);
 		bio = bio_alloc(GFP_NOFS, 1);
-		if (!bio)
-			return -EIO;
+		if (!bio) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
+			continue;
+		}
 		bio->bi_bdev = page->dev->bdev;
 		bio->bi_sector = page->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
 		bio->bi_private = &complete;
 
-		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (PAGE_SIZE != ret) {
-			bio_put(bio);
-			return -EIO;
-		}
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
 		btrfsic_submit_bio(READ, bio);
 
 		/* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 					     have_csum, csum, generation,
 					     csum_size);
 
-	return 0;
+	return;
 }
 
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *root = fs_info->extent_root;
 	void *mapped_buffer;
 
-	BUG_ON(!sblock->pagev[0].page);
+	WARN_ON(!sblock->pagev[0]->page);
 	if (is_metadata) {
 		struct btrfs_header *h;
 
-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 		h = (struct btrfs_header *)mapped_buffer;
 
-		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 			   BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		if (!have_csum)
 			return;
 
-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 	}
 
 	for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		page_num++;
 		if (page_num >= sblock->page_count)
 			break;
-		BUG_ON(!sblock->pagev[page_num].page);
+		WARN_ON(!sblock->pagev[page_num]->page);
 
-		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
 	}
 
 	btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write)
 {
-	struct scrub_page *page_bad = sblock_bad->pagev + page_num;
-	struct scrub_page *page_good = sblock_good->pagev + page_num;
+	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+	struct scrub_page *page_good = sblock_good->pagev[page_num];
 
-	BUG_ON(sblock_bad->pagev[page_num].page == NULL);
-	BUG_ON(sblock_good->pagev[page_num].page == NULL);
+	BUG_ON(page_bad->page == NULL);
+	BUG_ON(page_good->page == NULL);
 	if (force_write || sblock_bad->header_error ||
 	    sblock_bad->checksum_error || page_bad->io_error) {
 		struct bio *bio;
 		int ret;
 		DECLARE_COMPLETION_ONSTACK(complete);
 
+		if (!page_bad->dev->bdev) {
+			printk_ratelimited(KERN_WARNING
+				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+			return -EIO;
+		}
+
 		bio = bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
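
The repair path above issues its rewrite synchronously: the bio's end_io callback completes an on-stack completion which the caller waits on. The idiom in isolation (sketch; scrub_complete_bio_end_io is the file's one-line callback that calls complete() on bio->bi_private):

	/* Sketch of the synchronous-write idiom used above. */
	static int submit_and_wait(struct bio *bio)
	{
		DECLARE_COMPLETION_ONSTACK(complete);

		bio->bi_private = &complete;
		bio->bi_end_io = scrub_complete_bio_end_io;
		btrfsic_submit_bio(WRITE, bio);

		/* this will also unplug the queue */
		wait_for_completion(&complete);
		return !bio_flagged(bio, BIO_UPTODATE) ? -EIO : 0;
	}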
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1455 return 0;
1238} 1456}
1239 1457
1240static void scrub_checksum(struct scrub_block *sblock) 1458static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1459{
1460 int page_num;
1461
1462 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1463 int ret;
1464
1465 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1466 if (ret)
1467 btrfs_dev_replace_stats_inc(
1468 &sblock->sctx->dev_root->fs_info->dev_replace.
1469 num_write_errors);
1470 }
1471}
1472
1473static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1474 int page_num)
1475{
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 BUG_ON(spage->page == NULL);
1479 if (spage->io_error) {
1480 void *mapped_buffer = kmap_atomic(spage->page);
1481
1482 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1483 flush_dcache_page(spage->page);
1484 kunmap_atomic(mapped_buffer);
1485 }
1486 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1487}
1488
1489static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1490 struct scrub_page *spage)
1491{
1492 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1493 struct scrub_bio *sbio;
1494 int ret;
1495
1496 mutex_lock(&wr_ctx->wr_lock);
1497again:
1498 if (!wr_ctx->wr_curr_bio) {
1499 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1500 GFP_NOFS);
1501 if (!wr_ctx->wr_curr_bio) {
1502 mutex_unlock(&wr_ctx->wr_lock);
1503 return -ENOMEM;
1504 }
1505 wr_ctx->wr_curr_bio->sctx = sctx;
1506 wr_ctx->wr_curr_bio->page_count = 0;
1507 }
1508 sbio = wr_ctx->wr_curr_bio;
1509 if (sbio->page_count == 0) {
1510 struct bio *bio;
1511
1512 sbio->physical = spage->physical_for_dev_replace;
1513 sbio->logical = spage->logical;
1514 sbio->dev = wr_ctx->tgtdev;
1515 bio = sbio->bio;
1516 if (!bio) {
1517 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1518 if (!bio) {
1519 mutex_unlock(&wr_ctx->wr_lock);
1520 return -ENOMEM;
1521 }
1522 sbio->bio = bio;
1523 }
1524
1525 bio->bi_private = sbio;
1526 bio->bi_end_io = scrub_wr_bio_end_io;
1527 bio->bi_bdev = sbio->dev->bdev;
1528 bio->bi_sector = sbio->physical >> 9;
1529 sbio->err = 0;
1530 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1531 spage->physical_for_dev_replace ||
1532 sbio->logical + sbio->page_count * PAGE_SIZE !=
1533 spage->logical) {
1534 scrub_wr_submit(sctx);
1535 goto again;
1536 }
1537
1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1539 if (ret != PAGE_SIZE) {
1540 if (sbio->page_count < 1) {
1541 bio_put(sbio->bio);
1542 sbio->bio = NULL;
1543 mutex_unlock(&wr_ctx->wr_lock);
1544 return -EIO;
1545 }
1546 scrub_wr_submit(sctx);
1547 goto again;
1548 }
1549
1550 sbio->pagev[sbio->page_count] = spage;
1551 scrub_page_get(spage);
1552 sbio->page_count++;
1553 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1554 scrub_wr_submit(sctx);
1555 mutex_unlock(&wr_ctx->wr_lock);
1556
1557 return 0;
1558}
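
scrub_add_page_to_wr_bio() above coalesces pages into the current write bio only while they extend both the physical and the logical run; any discontinuity forces a submit and a retry via the "again" label. A self-contained sketch of that contiguity rule, assuming simplified types (sk_bio and sk_page_fits are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SIZE 4096u

struct sk_bio {
	uint64_t physical;	/* start of the run on the target device */
	uint64_t logical;	/* start of the run in the logical space */
	unsigned page_count;	/* pages already in the bio */
};

/* a page may join only if it continues both runs exactly */
static int sk_page_fits(const struct sk_bio *bio,
			uint64_t physical, uint64_t logical)
{
	return bio->physical + bio->page_count * SK_PAGE_SIZE == physical &&
	       bio->logical  + bio->page_count * SK_PAGE_SIZE == logical;
}

int main(void)
{
	struct sk_bio bio = { .physical = 1 << 20, .logical = 1 << 30,
			      .page_count = 2 };

	/* next page in both runs: coalesce */
	printf("%d\n", sk_page_fits(&bio, (1 << 20) + 2 * SK_PAGE_SIZE,
				    ((uint64_t)1 << 30) + 2 * SK_PAGE_SIZE));
	/* physical hole: submit the current bio, then retry ("goto again") */
	printf("%d\n", sk_page_fits(&bio, (1 << 20) + 3 * SK_PAGE_SIZE,
				    ((uint64_t)1 << 30) + 2 * SK_PAGE_SIZE));
	return 0;
}

Checking both runs matters because a logically contiguous extent can be physically fragmented (and vice versa); only pages contiguous in both spaces can share one bio.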
1559
1560static void scrub_wr_submit(struct scrub_ctx *sctx)
1561{
1562 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1563 struct scrub_bio *sbio;
1564
1565 if (!wr_ctx->wr_curr_bio)
1566 return;
1567
1568 sbio = wr_ctx->wr_curr_bio;
1569 wr_ctx->wr_curr_bio = NULL;
1570 WARN_ON(!sbio->bio->bi_bdev);
1571 scrub_pending_bio_inc(sctx);
1572 /* Process all writes in a single worker thread, so that the
1573 * block layer can order the requests before sending them to the
1574 * driver; this doubled the write performance on spinning disks
1575 * when measured with Linux 3.5. */
1576 btrfsic_submit_bio(WRITE, sbio->bio);
1577}
1578
1579static void scrub_wr_bio_end_io(struct bio *bio, int err)
1580{
1581 struct scrub_bio *sbio = bio->bi_private;
1582 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1583
1584 sbio->err = err;
1585 sbio->bio = bio;
1586
1587 sbio->work.func = scrub_wr_bio_end_io_worker;
1588 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1589}
1590
1591static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1592{
1593 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1594 struct scrub_ctx *sctx = sbio->sctx;
1595 int i;
1596
1597 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1598 if (sbio->err) {
1599 struct btrfs_dev_replace *dev_replace =
1600 &sbio->sctx->dev_root->fs_info->dev_replace;
1601
1602 for (i = 0; i < sbio->page_count; i++) {
1603 struct scrub_page *spage = sbio->pagev[i];
1604
1605 spage->io_error = 1;
1606 btrfs_dev_replace_stats_inc(&dev_replace->
1607 num_write_errors);
1608 }
1609 }
1610
1611 for (i = 0; i < sbio->page_count; i++)
1612 scrub_page_put(sbio->pagev[i]);
1613
1614 bio_put(sbio->bio);
1615 kfree(sbio);
1616 scrub_pending_bio_dec(sctx);
1617}
1618
1619static int scrub_checksum(struct scrub_block *sblock)
1241{ 1620{
1242 u64 flags; 1621 u64 flags;
1243 int ret; 1622 int ret;
1244 1623
1245 BUG_ON(sblock->page_count < 1); 1624 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1625 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1626 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1627 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1628 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1634 WARN_ON(1);
1256 if (ret) 1635 if (ret)
1257 scrub_handle_errored_block(sblock); 1636 scrub_handle_errored_block(sblock);
1637
1638 return ret;
1258} 1639}
1259 1640
1260static int scrub_checksum_data(struct scrub_block *sblock) 1641static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1642{
1262 struct scrub_dev *sdev = sblock->sdev; 1643 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1644 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1645 u8 *on_disk_csum;
1265 struct page *page; 1646 struct page *page;
1266 void *buffer; 1647 void *buffer;
1267 u32 crc = ~(u32)0; 1648 u32 crc = ~(u32)0;
1268 int fail = 0; 1649 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1650 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1651 u64 len;
1271 int index; 1652 int index;
1272 1653
1273 BUG_ON(sblock->page_count < 1); 1654 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1655 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1656 return 0;
1276 1657
1277 on_disk_csum = sblock->pagev[0].csum; 1658 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1659 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1660 buffer = kmap_atomic(page);
1280 1661
1281 len = sdev->sectorsize; 1662 len = sctx->sectorsize;
1282 index = 0; 1663 index = 0;
1283 for (;;) { 1664 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1665 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1671 break;
1291 index++; 1672 index++;
1292 BUG_ON(index >= sblock->page_count); 1673 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1674 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1675 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1676 buffer = kmap_atomic(page);
1296 } 1677 }
1297 1678
1298 btrfs_csum_final(crc, csum); 1679 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1680 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1681 fail = 1;
1301 1682
1302 return fail; 1683 return fail;
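
The loop above accumulates the data checksum one mapped page at a time, because a block may span several pages that cannot all be kmapped at once. A userspace sketch of the same chunked pattern, with a toy checksum standing in for the real btrfs_csum_data()/btrfs_csum_final() helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SK_PAGE 4096u

/* toy stand-in for the kernel crc32c helpers */
static uint32_t sk_csum(uint32_t crc, const unsigned char *buf, size_t len)
{
	while (len--)
		crc = (crc << 1) ^ *buf++;
	return crc;
}

int main(void)
{
	static unsigned char pages[2][SK_PAGE];	/* a two-page "block" */
	uint32_t crc = ~(uint32_t)0;
	uint64_t len = sizeof(pages);
	int index = 0;

	memset(pages, 0xa5, sizeof(pages));
	while (len) {
		size_t l = len < SK_PAGE ? (size_t)len : SK_PAGE;

		/* the kernel code maps exactly one page per iteration */
		crc = sk_csum(crc, pages[index], l);
		len -= l;
		index++;
	}
	printf("crc 0x%08x\n", (unsigned)crc);
	return 0;
}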
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1685
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1686static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1687{
1307 struct scrub_dev *sdev = sblock->sdev; 1688 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1689 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1690 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1691 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1692 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1693 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1702 int index;
1322 1703
1323 BUG_ON(sblock->page_count < 1); 1704 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1705 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1706 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1707 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1708 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1709
1329 /* 1710 /*
1330 * we don't use the getter functions here, as we 1711 * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1713 * b) the page is already kmapped
1333 */ 1714 */
1334 1715
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1716 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1717 ++fail;
1337 1718
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1719 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1720 ++fail;
1340 1721
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1722 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1726 BTRFS_UUID_SIZE))
1346 ++fail; 1727 ++fail;
1347 1728
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1729 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1730 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1731 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1732 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1733 index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1741 break;
1361 index++; 1742 index++;
1362 BUG_ON(index >= sblock->page_count); 1743 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1744 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1745 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1746 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1747 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1748 p = mapped_buffer;
1368 } 1749 }
1369 1750
1370 btrfs_csum_final(crc, calculated_csum); 1751 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1752 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1753 ++crc_fail;
1373 1754
1374 return fail || crc_fail; 1755 return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1758static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1759{
1379 struct btrfs_super_block *s; 1760 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1761 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1762 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1763 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1764 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1765 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1774 int index;
1394 1775
1395 BUG_ON(sblock->page_count < 1); 1776 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1777 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1778 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1779 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1780 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1781
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1782 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1783 ++fail_cor;
1403 1784
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1785 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1786 ++fail_gen;
1406 1787
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1788 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1802 break;
1422 index++; 1803 index++;
1423 BUG_ON(index >= sblock->page_count); 1804 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1805 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1806 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1807 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1808 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1809 p = mapped_buffer;
1429 } 1810 }
1430 1811
1431 btrfs_csum_final(crc, calculated_csum); 1812 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1813 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1814 ++fail_cor;
1434 1815
1435 if (fail_cor + fail_gen) { 1816 if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * They will get written with the next transaction commit 1819 * They will get written with the next transaction commit
1439 * anyway 1820 * anyway
1440 */ 1821 */
1441 spin_lock(&sdev->stat_lock); 1822 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1823 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1824 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1825 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1826 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1827 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1828 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1829 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1830 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1831 }
1451 1832
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1844 int i;
1464 1845
1465 for (i = 0; i < sblock->page_count; i++) 1846 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1847 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1848 kfree(sblock);
1469 } 1849 }
1470} 1850}
1471 1851
1472static void scrub_submit(struct scrub_dev *sdev) 1852static void scrub_page_get(struct scrub_page *spage)
1853{
1854 atomic_inc(&spage->ref_count);
1855}
1856
1857static void scrub_page_put(struct scrub_page *spage)
1858{
1859 if (atomic_dec_and_test(&spage->ref_count)) {
1860 if (spage->page)
1861 __free_page(spage->page);
1862 kfree(spage);
1863 }
1864}
1865
1866static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1867{
1474 struct scrub_bio *sbio; 1868 struct scrub_bio *sbio;
1475 1869
1476 if (sdev->curr == -1) 1870 if (sctx->curr == -1)
1477 return; 1871 return;
1478 1872
1479 sbio = sdev->bios[sdev->curr]; 1873 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1874 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1875 scrub_pending_bio_inc(sctx);
1482 1876
1483 btrfsic_submit_bio(READ, sbio->bio); 1877 if (!sbio->bio->bi_bdev) {
1878 /*
1879 * This case should not happen. If btrfs_map_block() is
1880 * wrong, it could happen for dev-replace operations on
1881 * missing devices when no mirrors are available, but in
1882 * that case the mount should already have failed.
1883 * The case is handled correctly here (but _very_ slowly).
1884 */
1885 printk_ratelimited(KERN_WARNING
1886 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1887 bio_endio(sbio->bio, -EIO);
1888 } else {
1889 btrfsic_submit_bio(READ, sbio->bio);
1890 }
1484} 1891}
1485 1892
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1893static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1894 struct scrub_page *spage)
1488{ 1895{
1489 struct scrub_block *sblock = spage->sblock; 1896 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1897 struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
1494 /* 1901 /*
1495 * grab a fresh bio or wait for one to become available 1902 * grab a fresh bio or wait for one to become available
1496 */ 1903 */
1497 while (sdev->curr == -1) { 1904 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1905 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1906 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1907 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1908 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1909 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1910 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1911 spin_unlock(&sctx->list_lock);
1505 } else { 1912 } else {
1506 spin_unlock(&sdev->list_lock); 1913 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1914 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1915 }
1509 } 1916 }
1510 sbio = sdev->bios[sdev->curr]; 1917 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1918 if (sbio->page_count == 0) {
1512 struct bio *bio; 1919 struct bio *bio;
1513 1920
1514 sbio->physical = spage->physical; 1921 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1922 sbio->logical = spage->logical;
1923 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1924 bio = sbio->bio;
1517 if (!bio) { 1925 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1926 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1927 if (!bio)
1520 return -ENOMEM; 1928 return -ENOMEM;
1521 sbio->bio = bio; 1929 sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
1523 1931
1524 bio->bi_private = sbio; 1932 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1933 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1934 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1935 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1936 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1937 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1938 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1939 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1940 spage->logical ||
1533 scrub_submit(sdev); 1941 sbio->dev != spage->dev) {
1942 scrub_submit(sctx);
1534 goto again; 1943 goto again;
1535 } 1944 }
1536 1945
@@ -1542,81 +1951,87 @@ again:
1542 sbio->bio = NULL; 1951 sbio->bio = NULL;
1543 return -EIO; 1952 return -EIO;
1544 } 1953 }
1545 scrub_submit(sdev); 1954 scrub_submit(sctx);
1546 goto again; 1955 goto again;
1547 } 1956 }
1548 1957
1549 scrub_block_get(sblock); /* one for the added page */ 1958 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1959 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1960 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1961 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1962 scrub_submit(sctx);
1554 1963
1555 return 0; 1964 return 0;
1556} 1965}
1557 1966
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1967static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1968 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1969 u64 gen, int mirror_num, u8 *csum, int force,
1970 u64 physical_for_dev_replace)
1561{ 1971{
1562 struct scrub_block *sblock; 1972 struct scrub_block *sblock;
1563 int index; 1973 int index;
1564 1974
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1976 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1977 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1978 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1979 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1980 return -ENOMEM;
1571 } 1981 }
1572 1982
1573 /* one ref inside this function, plus one for each page later on */ 1983 /* one ref inside this function, plus one for each page added to
1984 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1985 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1986 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1987 sblock->no_io_error_seen = 1;
1577 1988
1578 for (index = 0; len > 0; index++) { 1989 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1990 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1991 u64 l = min_t(u64, len, PAGE_SIZE);
1581 1992
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1993 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 1994 if (!spage) {
1584 if (!spage->page) { 1995leave_nomem:
1585 spin_lock(&sdev->stat_lock); 1996 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 1997 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 1998 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 1999 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2000 return -ENOMEM;
1594 } 2001 }
2002 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2003 scrub_page_get(spage);
2004 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2005 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2006 spage->dev = dev;
1597 spage->flags = flags; 2007 spage->flags = flags;
1598 spage->generation = gen; 2008 spage->generation = gen;
1599 spage->logical = logical; 2009 spage->logical = logical;
1600 spage->physical = physical; 2010 spage->physical = physical;
2011 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2012 spage->mirror_num = mirror_num;
1602 if (csum) { 2013 if (csum) {
1603 spage->have_csum = 1; 2014 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2015 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2016 } else {
1606 spage->have_csum = 0; 2017 spage->have_csum = 0;
1607 } 2018 }
1608 sblock->page_count++; 2019 sblock->page_count++;
2020 spage->page = alloc_page(GFP_NOFS);
2021 if (!spage->page)
2022 goto leave_nomem;
1609 len -= l; 2023 len -= l;
1610 logical += l; 2024 logical += l;
1611 physical += l; 2025 physical += l;
2026 physical_for_dev_replace += l;
1612 } 2027 }
1613 2028
1614 BUG_ON(sblock->page_count == 0); 2029 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2030 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2031 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2032 int ret;
1618 2033
1619 ret = scrub_add_page_to_bio(sdev, spage); 2034 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2035 if (ret) {
1621 scrub_block_put(sblock); 2036 scrub_block_put(sblock);
1622 return ret; 2037 return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2039 }
1625 2040
1626 if (force) 2041 if (force)
1627 scrub_submit(sdev); 2042 scrub_submit(sctx);
1628 2043
1629 /* last one frees, either here or in bio completion for last page */ 2044 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2045 scrub_block_put(sblock);
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2049static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2050{
1636 struct scrub_bio *sbio = bio->bi_private; 2051 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2052 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2053
1640 sbio->err = err; 2054 sbio->err = err;
1641 sbio->bio = bio; 2055 sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2060static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2061{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2062 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2063 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2064 int i;
1651 2065
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2066 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2067 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2068 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2069 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2085
1672 bio_put(sbio->bio); 2086 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2087 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2088 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2089 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2090 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2091 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2092
1679 wake_up(&sdev->list_wait); 2093 if (sctx->is_dev_replace &&
2094 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2095 mutex_lock(&sctx->wr_ctx.wr_lock);
2096 scrub_wr_submit(sctx);
2097 mutex_unlock(&sctx->wr_ctx.wr_lock);
2098 }
2099
2100 scrub_pending_bio_dec(sctx);
1680} 2101}
1681 2102
1682static void scrub_block_complete(struct scrub_block *sblock) 2103static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2104{
1684 if (!sblock->no_io_error_seen) 2105 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2106 scrub_handle_errored_block(sblock);
1686 else 2107 } else {
1687 scrub_checksum(sblock); 2108 /*
2109 * If the block has a checksum error, it is written via the
2110 * repair mechanism; otherwise, in the dev-replace case, it is
2111 * written out here.
2112 */
2113 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2114 scrub_write_block_to_dev_replace(sblock);
2115 }
1688} 2116}
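
The branch above encodes the completion policy: an I/O error sends the block down the repair path, a failed checksum reaches the same path from inside scrub_checksum(), and only a block that verified cleanly is copied straight to the target, and then only while a device replace is running. A tiny decision sketch with hypothetical names, not the kernel's actual control flow:

#include <stdio.h>

enum sk_action { SK_NOTHING, SK_REPAIR, SK_COPY_TO_TARGET };

static enum sk_action sk_block_complete(int io_error_seen, int csum_ok,
					int is_dev_replace)
{
	if (io_error_seen || !csum_ok)
		return SK_REPAIR;	/* repair also covers the target */
	return is_dev_replace ? SK_COPY_TO_TARGET : SK_NOTHING;
}

int main(void)
{
	printf("%d\n", sk_block_complete(0, 1, 1));	/* 2: copy to target */
	printf("%d\n", sk_block_complete(0, 0, 1));	/* 1: repair path */
	printf("%d\n", sk_block_complete(0, 1, 0));	/* 0: plain scrub done */
	return 0;
}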
1689 2117
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2118static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2119 u8 *csum)
1692{ 2120{
1693 struct btrfs_ordered_sum *sum = NULL; 2121 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2123 unsigned long i;
1696 unsigned long num_sectors; 2124 unsigned long num_sectors;
1697 2125
1698 while (!list_empty(&sdev->csum_list)) { 2126 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2127 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2128 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2129 if (sum->bytenr > logical)
1702 return 0; 2130 return 0;
1703 if (sum->bytenr + sum->len > logical) 2131 if (sum->bytenr + sum->len > logical)
1704 break; 2132 break;
1705 2133
1706 ++sdev->stat.csum_discards; 2134 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2135 list_del(&sum->list);
1708 kfree(sum); 2136 kfree(sum);
1709 sum = NULL; 2137 sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2139 if (!sum)
1712 return 0; 2140 return 0;
1713 2141
1714 num_sectors = sum->len / sdev->sectorsize; 2142 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2143 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2144 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2145 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2146 ret = 1;
1719 break; 2147 break;
1720 } 2148 }
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2155}
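
scrub_find_csum() above first discards list entries that end at or before the requested logical address, then scans the matching entry sector by sector. A simplified sketch of that in-entry lookup, assuming stand-in structures (sk_sum is illustrative, and a plain u32 replaces the real checksum buffer):

#include <stdint.h>
#include <stdio.h>

#define SK_SECTORSIZE 4096u

struct sk_sum {
	uint64_t bytenr;	/* logical start covered by this entry */
	uint64_t len;		/* bytes covered */
	uint32_t sums[16];	/* one checksum per sector */
};

/* returns 1 and copies the csum if 'logical' is covered, else 0 */
static int sk_find_csum(const struct sk_sum *sum, uint64_t logical,
			uint32_t *csum)
{
	uint64_t i, num_sectors;

	if (logical < sum->bytenr || logical >= sum->bytenr + sum->len)
		return 0;
	num_sectors = sum->len / SK_SECTORSIZE;
	for (i = 0; i < num_sectors; i++) {
		if (sum->bytenr + i * SK_SECTORSIZE == logical) {
			*csum = sum->sums[i];
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	struct sk_sum sum = { .bytenr = 0x10000, .len = 4 * SK_SECTORSIZE,
			      .sums = { 0x11, 0x22, 0x33, 0x44 } };
	uint32_t csum = 0;

	if (sk_find_csum(&sum, 0x10000 + 2 * SK_SECTORSIZE, &csum))
		printf("csum 0x%x\n", (unsigned)csum);	/* prints 0x33 */
	return 0;
}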
1728 2156
1729/* scrub extent tries to collect up to 64 kB for each bio */ 2157/* scrub extent tries to collect up to 64 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2158static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2159 u64 physical, struct btrfs_device *dev, u64 flags,
2160 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2161{
1733 int ret; 2162 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2163 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2164 u32 blocksize;
1736 2165
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2167 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2168 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2169 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2170 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2171 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2173 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2174 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2175 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2176 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2177 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2178 spin_unlock(&sctx->stat_lock);
1750 } else { 2179 } else {
1751 blocksize = sdev->sectorsize; 2180 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2181 WARN_ON(1);
1753 } 2182 }
1754 2183
1755 while (len) { 2184 while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2187
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2188 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2189 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2190 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2191 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2192 ++sctx->stat.no_csum;
2193 if (sctx->is_dev_replace && !have_csum) {
2194 ret = copy_nocow_pages(sctx, logical, l,
2195 mirror_num,
2196 physical_for_dev_replace);
2197 goto behind_scrub_pages;
2198 }
1764 } 2199 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2200 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2201 mirror_num, have_csum ? csum : NULL, 0,
2202 physical_for_dev_replace);
2203behind_scrub_pages:
1767 if (ret) 2204 if (ret)
1768 return ret; 2205 return ret;
1769 len -= l; 2206 len -= l;
1770 logical += l; 2207 logical += l;
1771 physical += l; 2208 physical += l;
2209 physical_for_dev_replace += l;
1772 } 2210 }
1773 return 0; 2211 return 0;
1774} 2212}
1775 2213
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2214static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2215 struct map_lookup *map,
2216 struct btrfs_device *scrub_dev,
2217 int num, u64 base, u64 length,
2218 int is_dev_replace)
1778{ 2219{
1779 struct btrfs_path *path; 2220 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2221 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2222 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2223 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2224 struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2238 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2239 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2240 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2241 u64 increment = map->stripe_len;
1802 u64 offset; 2242 u64 offset;
2243 u64 extent_logical;
2244 u64 extent_physical;
2245 u64 extent_len;
2246 struct btrfs_device *extent_dev;
2247 int extent_mirror_num;
1803 2248
1804 nstripes = length; 2249 nstripes = length;
1805 offset = 0; 2250 offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2288 */
1844 logical = base + offset; 2289 logical = base + offset;
1845 2290
1846 wait_event(sdev->list_wait, 2291 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2292 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2293 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2294 wake_up(&fs_info->scrub_pause_wait);
1850 2295
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2343 * canceled?
1899 */ 2344 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2345 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2346 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2347 ret = -ECANCELED;
1903 goto out; 2348 goto out;
1904 } 2349 }
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2352 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2353 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2354 /* push queued extents */
1910 scrub_submit(sdev); 2355 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2356 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2357 mutex_lock(&sctx->wr_ctx.wr_lock);
2358 scrub_wr_submit(sctx);
2359 mutex_unlock(&sctx->wr_ctx.wr_lock);
2360 wait_event(sctx->list_wait,
2361 atomic_read(&sctx->bios_in_flight) == 0);
2362 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2363 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2364 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2365 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2376
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2377 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2378 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2379 &sctx->csum_list, 1);
1930 if (ret) 2380 if (ret)
1931 goto out; 2381 goto out;
1932 2382
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2454 key.objectid;
2005 } 2455 }
2006 2456
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2457 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2458 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2459 extent_len = key.offset;
2460 extent_dev = scrub_dev;
2461 extent_mirror_num = mirror_num;
2462 if (is_dev_replace)
2463 scrub_remap_extent(fs_info, extent_logical,
2464 extent_len, &extent_physical,
2465 &extent_dev,
2466 &extent_mirror_num);
2467 ret = scrub_extent(sctx, extent_logical, extent_len,
2468 extent_physical, extent_dev, flags,
2469 generation, extent_mirror_num,
2470 key.objectid - logical + physical);
2010 if (ret) 2471 if (ret)
2011 goto out; 2472 goto out;
2012 2473
@@ -2016,29 +2477,34 @@ next:
2016 btrfs_release_path(path); 2477 btrfs_release_path(path);
2017 logical += increment; 2478 logical += increment;
2018 physical += map->stripe_len; 2479 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2480 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2481 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2482 spin_unlock(&sctx->stat_lock);
2022 } 2483 }
2484out:
2023 /* push queued extents */ 2485 /* push queued extents */
2024 scrub_submit(sdev); 2486 scrub_submit(sctx);
2487 mutex_lock(&sctx->wr_ctx.wr_lock);
2488 scrub_wr_submit(sctx);
2489 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2490
2026out:
2027 blk_finish_plug(&plug); 2491 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2492 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2493 return ret < 0 ? ret : 0;
2030} 2494}
2031 2495
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2496static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2497 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2498 u64 chunk_tree, u64 chunk_objectid,
2499 u64 chunk_offset, u64 length,
2500 u64 dev_offset, int is_dev_replace)
2035{ 2501{
2036 struct btrfs_mapping_tree *map_tree = 2502 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2503 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2504 struct map_lookup *map;
2039 struct extent_map *em; 2505 struct extent_map *em;
2040 int i; 2506 int i;
2041 int ret = -EINVAL; 2507 int ret = 0;
2042 2508
2043 read_lock(&map_tree->map_tree.lock); 2509 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2510 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2521 goto out;
2056 2522
2057 for (i = 0; i < map->num_stripes; ++i) { 2523 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2524 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2525 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2526 ret = scrub_stripe(sctx, map, scrub_dev, i,
2527 chunk_offset, length,
2528 is_dev_replace);
2061 if (ret) 2529 if (ret)
2062 goto out; 2530 goto out;
2063 } 2531 }
@@ -2069,11 +2537,13 @@ out:
2069} 2537}
2070 2538
2071static noinline_for_stack 2539static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2540int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2541 struct btrfs_device *scrub_dev, u64 start, u64 end,
2542 int is_dev_replace)
2073{ 2543{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2544 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2545 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2546 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2547 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2548 u64 length;
2079 u64 chunk_tree; 2549 u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2555 struct btrfs_key key;
2086 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2559
2089 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2090 if (!path) 2561 if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2566 path->skip_locking = 1;
2096 2567
2097 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2569 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2571
2101
2102 while (1) { 2572 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2574 if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2587
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2589
2120 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2121 break; 2591 break;
2122 2592
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2621 ret = -ENOENT;
2152 break; 2622 break;
2153 } 2623 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
2632 * Flush and submit all pending read and write bios, and
2633 * afterwards wait for them to complete.
2634 * Note that in the dev-replace case, a read request triggers
2635 * write requests that are submitted from within the read
2636 * completion worker. Therefore all write requests must be
2637 * flushed as well, so that all read and write requests have
2638 * really completed when bios_in_flight
2639 * changes to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2157 if (ret) 2669 if (ret)
2158 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2159 2680
2160 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2682 btrfs_release_path(path);
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2171} 2692}
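
The chunk loop above has to flush writes before it waits, because a read completion can still queue a write; only after that flush does bios_in_flight == 0 mean "everything is done". That ordering can be modeled in userspace with a counter and a condition variable; pending_inc/pending_dec below are stand-ins for scrub_pending_bio_inc()/scrub_pending_bio_dec(), and the worker mimics a read completion that queues a follow-up write before signalling:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int in_flight;

static void pending_inc(void)
{
	pthread_mutex_lock(&lock);
	in_flight++;
	pthread_mutex_unlock(&lock);
}

static void pending_dec(void)
{
	pthread_mutex_lock(&lock);
	if (--in_flight == 0)
		pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void *completion_worker(void *arg)
{
	(void)arg;
	pending_inc();		/* queue the follow-up write first... */
	pending_dec();		/* ...then complete the read */
	pending_dec();		/* the write completes later */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pending_inc();		/* submit the read */
	pthread_create(&t, NULL, completion_worker, NULL);
	pthread_mutex_lock(&lock);
	while (in_flight != 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	printf("all reads and writes completed\n");
	return 0;
}

Because the worker bumps the counter for the write before dropping it for the read, the count never touches zero while work is still implied, which is exactly what the flush_all_writes discipline guarantees.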
2172 2693
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2174{ 2696{
2175 int i; 2697 int i;
2176 u64 bytenr; 2698 u64 bytenr;
2177 u64 gen; 2699 u64 gen;
2178 int ret; 2700 int ret;
2179 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2702
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2704 return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2707
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2711 break;
2191 2712
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2194 if (ret) 2716 if (ret)
2195 return ret; 2717 return ret;
2196 } 2718 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2720
2199 return 0; 2721 return 0;
2200} 2722}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2724/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2726 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2206{ 2729{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2730 int ret = 0;
2209 2731
2210 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2743 if (ret)
2217 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2218 } 2759 }
2219 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2220out: 2761out:
@@ -2223,40 +2764,41 @@ out:
2223 return ret; 2764 return ret;
2224} 2765}
2225 2766
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2768{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2235} 2777}
2236 2778
2237 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2240{ 2782{
2241 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2784 int ret;
2244 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2245 2786
2246 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2788 return -EINVAL;
2248 2789
2249 /* 2790 /*
2250 * check some assumptions 2791 * check some assumptions
2251 */ 2792 */
2252 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2794 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2798 return -EINVAL;
2257 } 2799 }
2258 2800
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2802 /*
2261 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2806 */
2265 printk(KERN_ERR 2807 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2810 return -EINVAL;
2269 } 2811 }
2270 2812
2271 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2815 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2819 return -EINVAL;
2277 } 2820 }
2278 2821
2279 ret = scrub_workers_get(root); 2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
2827 * would exhaust the array bounds of pagev member in
2828 * struct scrub_block
2829 */
2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2831 fs_info->chunk_root->nodesize,
2832 SCRUB_MAX_PAGES_PER_BLOCK,
2833 fs_info->chunk_root->sectorsize,
2834 SCRUB_MAX_PAGES_PER_BLOCK);
2835 return -EINVAL;
2836 }
2837
2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2839 if (ret)
2281 return ret; 2840 return ret;
2282 2841
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2288 return -ENODEV; 2847 return -ENODEV;
2289 } 2848 }
2290 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2291 2850
2292 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2296 return -ENODEV; 2855 return -EIO;
2297 } 2856 }
2298 2857
2299 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2866 return -EINPROGRESS;
2304 } 2867 }
2305 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2311 } 2875 }
2312 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2314 2878
2315 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2882
2319 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2322 2888
2323 if (!ret) 2889 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2325 2892
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2329 2896
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2898
2332 if (progress) 2899 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2901
2335 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2338 2905
2339 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2341 2908
2342 return ret; 2909 return ret;
2343} 2910}
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2378} 2945}
2379 2946
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2948{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2965 return 0;
2400} 2966}
2401 2967
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2403{ 2970{
2404 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2972
2412 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2414 if (!sdev) { 2975 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2977 return -ENOTCONN;
2417 } 2978 }
2418 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3000 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3003 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3005 return -ENODEV;
2445 } 3006 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3009
2449 return ret; 3010 return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2454{ 3015{
2455 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2457 3018
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3021 if (dev)
2461 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2462 if (sdev) 3023 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3026
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
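
scrub_remap_extent() delegates the logical-to-physical translation to the block mapper and keeps only stripe 0 of the answer; on any surprise (error, short mapping, stripe without a bdev) it frees the mapping and returns with the out parameters untouched. The following stand-alone C sketch models that contract; map_block() is an invented stub standing in for btrfs_map_block(), not the real call:

#include <stdio.h>

struct stripe { unsigned long long physical; int devid; };
struct bio_map { int nr_stripes; int mirror; struct stripe s[4]; };

/* Invented stub mapper: pretend the range maps 1:1 onto device 1. */
static int map_block(unsigned long long logical, unsigned long long *len,
		     struct bio_map *out)
{
	if (!*len)
		return -1;
	out->nr_stripes = 1;
	out->mirror = 1;
	out->s[0].physical = logical;
	out->s[0].devid = 1;
	return 0;
}

/* Mirrors the contract above: on any surprise, return without
 * touching the out parameters, like the early kfree+return path. */
static int remap_extent(unsigned long long logical, unsigned long long len,
			unsigned long long *physical, int *devid, int *mirror)
{
	unsigned long long mapped = len;
	struct bio_map m;

	if (map_block(logical, &mapped, &m) || mapped < len)
		return -1;		/* failed or short mapping */
	*physical = m.s[0].physical;	/* stripe 0 only */
	*devid = m.s[0].devid;
	*mirror = m.mirror;
	return 0;
}

int main(void)
{
	unsigned long long phys;
	int devid, mirror;

	if (!remap_extent(65536, 4096, &phys, &devid, &mirror))
		printf("physical %llu on dev %d (mirror %d)\n",
		       phys, devid, mirror);
	return 0;
}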
3054
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
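
copy_nocow_pages() itself is only a handoff: the context is allocated, the pending-workers count is bumped before the work item is queued, and ownership of the allocation passes to the worker, which later frees it and drops the count that the scrub teardown waits on. A minimal userspace analogue of that lifecycle, assuming pthreads stand in for the btrfs worker pool (all names invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct nocow_work {
	unsigned long long logical;
	unsigned long long len;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int workers_pending;

static void *nocow_worker(void *arg)
{
	struct nocow_work *w = arg;

	printf("copy %llu bytes at logical %llu\n", w->len, w->logical);
	free(w);			/* the worker owns the context */

	pthread_mutex_lock(&lock);
	if (--workers_pending == 0)
		pthread_cond_signal(&idle);	/* cf. sctx->list_wait */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	struct nocow_work *w = calloc(1, sizeof(*w));
	pthread_t t;

	if (!w)
		return 1;		/* cf. stat.malloc_errors++ */
	w->logical = 8192;
	w->len = 4096;

	pthread_mutex_lock(&lock);
	workers_pending++;		/* account *before* queueing */
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, nocow_worker, w);

	pthread_mutex_lock(&lock);
	while (workers_pending)		/* cf. wait_event(...) */
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}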
3111
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
3163 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3164 num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
3218 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3219 io_tree,
3220 page, btrfs_get_extent,
3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
3226 wait_on_page_locked(page);
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
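
Note how the page loop above funnels every outcome through the next_page label, so unlock_page() and put_page() run no matter how the iteration ends, while ret keeps the first error and the walk continues to the next page. The same control shape in plain, runnable C, with a heap buffer standing in for the locked page (illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stub: "read" a chunk; fails on the third chunk to show the flow. */
static int read_chunk(unsigned char *buf, size_t len, size_t idx)
{
	if (idx == 2)
		return -5;		/* cf. -EIO */
	memset(buf, (int)idx, len);
	return 0;
}

static int copy_chunks(size_t nr, size_t len)
{
	int ret = 0;
	size_t i;

	for (i = 0; i < nr; i++) {
		unsigned char *buf = malloc(len);  /* cf. find_or_create_page */
		int err;

		if (!buf) {
			ret = -12;	/* cf. -ENOMEM */
			goto next;
		}
		err = read_chunk(buf, len, i);
		if (err) {
			ret = err;	/* remember the error, keep walking */
			goto next;
		}
		/* ... write the chunk to its new home here ... */
next:
		free(buf);		/* cf. unlock_page() + put_page() */
	}
	return ret;
}

int main(void)
{
	printf("copy_chunks: %d\n", copy_chunks(4, 4096));
	return 0;
}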
3253
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2467} 3297}
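
write_page_nocow() converts the asynchronous bio API into a synchronous write: a completion lives on the stack, the end_io callback fires complete(), and the submitter blocks in wait_for_completion() before checking BIO_UPTODATE. A small userspace model of that one-shot completion, assuming a pthread plays the role of the I/O completion context (invented names, not kernel code):

#include <pthread.h>
#include <stdio.h>

/* One-shot completion, cf. DECLARE_COMPLETION_ONSTACK(). */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *fake_io(void *arg)	/* cf. scrub_complete_bio_end_io() */
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion compl;
	pthread_t t;

	init_completion(&compl);
	pthread_create(&t, NULL, fake_io, &compl); /* cf. btrfsic_submit_bio */
	wait_for_completion(&compl);	/* block until the "bio" ends */
	pthread_join(t, NULL);
	printf("write done\n");
	return 0;
}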
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4397 if (!path)
4398 return -ENOMEM; 4398 return -ENOMEM;
4399 4399
4400 spin_lock(&send_root->root_times_lock); 4400 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4402 spin_unlock(&send_root->root_item_lock);
4403 4403
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4405 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4422 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4423 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4424 */
4425 spin_lock(&send_root->root_times_lock); 4425 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4426 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4427 spin_unlock(&send_root->root_item_lock);
4428 4428
4429 if (ctransid != start_ctransid) { 4429 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
124 * therefore the canceling is omitted. The only penalty
125 * is that some I/O remains active until the procedure
126 * completes. The next time when the filesystem is
127 * mounted writeable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
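
The init_btrfs_fs() hunk shows the maintenance cost of the goto-ladder style: wiring btrfs_auto_defrag_init() in between two existing steps means adding a free_auto_defrag unwind label for later failures, plus the mirror-image btrfs_auto_defrag_exit() call in exit_btrfs_fs(), in exact reverse order. A compact, runnable sketch of that invariant with stub init/exit pairs (invented names):

#include <stdio.h>

static int  init_a(void) { return 0; }
static void exit_a(void) { puts("exit_a"); }
static int  init_b(void) { return 0; }	/* the newly inserted step */
static void exit_b(void) { puts("exit_b"); }
static int  init_c(void) { return -1; }	/* pretend this fails */
static void exit_c(void) { puts("exit_c"); }

static int init_all(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto free_a;
	err = init_c();
	if (err)
		goto free_b;	/* new label: everything before c unwinds */
	return 0;

free_b:
	exit_b();
free_a:
	exit_a();
	return err;
}

static void exit_all(void)	/* teardown in exact reverse order */
{
	exit_c();
	exit_b();
	exit_a();
}

int main(void)
{
	if (init_all())
		puts("init failed, unwound cleanly");
	else
		exit_all();
	return 0;
}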
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..87fac9a21ea5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 return ERR_PTR(ret);
344 } 337 }
@@ -422,13 +415,15 @@ got_it:
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 416 int num_items)
424{ 417{
425 return start_transaction(root, num_items, TRANS_START, 0); 418 return start_transaction(root, num_items, TRANS_START,
419 BTRFS_RESERVE_FLUSH_ALL);
426} 420}
427 421
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 422struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 423 struct btrfs_root *root, int num_items)
430{ 424{
431 return start_transaction(root, num_items, TRANS_START, 1); 425 return start_transaction(root, num_items, TRANS_START,
426 BTRFS_RESERVE_FLUSH_LIMIT);
432} 427}
433 428
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 429struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
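
The start_transaction() rework replaces the old int noflush flag, and the separate _noflush entry point, with one enum threaded down to the block-reservation call, so each caller names its flush policy (FLUSH_ALL for plain starts, FLUSH_LIMIT for the new _lflush variant). A sketch of that shape with invented stand-ins for the reservation internals:

#include <stdio.h>

enum reserve_flush {
	RESERVE_NO_FLUSH,	/* cf. BTRFS_RESERVE_NO_FLUSH */
	RESERVE_FLUSH_LIMIT,	/* cf. BTRFS_RESERVE_FLUSH_LIMIT */
	RESERVE_FLUSH_ALL,	/* cf. BTRFS_RESERVE_FLUSH_ALL */
};

static int try_reserve(unsigned long long bytes)
{
	return bytes > 1024 ? -28 /* cf. -ENOSPC */ : 0;
}

static void flush_space(int bounded)
{
	printf("flushing (%s)\n", bounded ? "bounded" : "everything");
}

/* One entry point; the policy is data, not a function variant. */
static int rsv_add(unsigned long long bytes, enum reserve_flush flush)
{
	int ret = try_reserve(bytes);

	if (!ret || flush == RESERVE_NO_FLUSH)
		return ret;
	flush_space(flush == RESERVE_FLUSH_LIMIT);
	return try_reserve(bytes);
}

int main(void)
{
	printf("%d\n", rsv_add(4096, RESERVE_FLUSH_ALL));
	printf("%d\n", rsv_add(512, RESERVE_NO_FLUSH));
	return 0;
}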
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 456int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 457{
463 struct btrfs_transaction *cur_trans = NULL, *t; 458 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 459 int ret = 0;
465 460
466 ret = 0;
467 if (transid) { 461 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 462 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 463 goto out;
470 464
465 ret = -EINVAL;
471 /* find specified transaction */ 466 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 467 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 468 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 469 if (t->transid == transid) {
475 cur_trans = t; 470 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 471 atomic_inc(&cur_trans->use_count);
472 ret = 0;
477 break; 473 break;
478 } 474 }
479 if (t->transid > transid) 475 if (t->transid > transid) {
476 ret = 0;
480 break; 477 break;
478 }
481 } 479 }
482 spin_unlock(&root->fs_info->trans_lock); 480 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 481 /* The specified transaction doesn't exist */
484 if (!cur_trans) 482 if (!cur_trans)
485 goto out; /* bad transid */ 483 goto out;
486 } else { 484 } else {
487 /* find newest transaction that is committing | committed */ 485 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 486 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 500 }
503 501
504 wait_for_commit(root, cur_trans); 502 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 503 put_transaction(cur_trans);
507 ret = 0;
508out: 504out:
509 return ret; 505 return ret;
510} 506}
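
The btrfs_wait_for_commit() rewrite inverts the error handling: ret now defaults to -EINVAL before the search and is cleared only on the non-error exits, either the transaction is found and pinned under the lock, or the sorted walk passes the requested transid, which can only mean it already committed; the trailing ret = 0 after wait_for_commit() becomes unnecessary. A self-contained sketch of that find-and-pin pattern (invented types; -22 stands in for -EINVAL):

#include <pthread.h>
#include <stdio.h>

struct txn {
	unsigned long long id;
	int refs;
	struct txn *next;	/* list sorted by ascending id */
};

static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;

static struct txn *find_txn(struct txn *head, unsigned long long id, int *ret)
{
	struct txn *t, *found = NULL;

	*ret = -22;			/* -EINVAL until proven otherwise */
	pthread_mutex_lock(&trans_lock);
	for (t = head; t; t = t->next) {
		if (t->id == id) {
			found = t;
			t->refs++;	/* pin before dropping the lock */
			*ret = 0;
			break;
		}
		if (t->id > id) {	/* walked past it: already committed */
			*ret = 0;
			break;
		}
	}
	pthread_mutex_unlock(&trans_lock);
	return found;			/* NULL with *ret == 0 means "done" */
}

int main(void)
{
	struct txn t2 = { 2, 1, NULL }, t1 = { 1, 1, &t2 };
	int ret;

	find_txn(&t1, 2, &ret);		/* found: ret == 0 */
	printf("lookup 2: %d\n", ret);
	find_txn(&t1, 9, &ret);		/* off the end: ret == -22 */
	printf("lookup 9: %d\n", ret);
	return 0;
}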
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 847 return ret;
852 848
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 849 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 850 WARN_ON(ret);
851 ret = btrfs_run_dev_replace(trans, root->fs_info);
852 WARN_ON(ret);
855 853
856 ret = btrfs_run_qgroups(trans, root->fs_info); 854 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 855 BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 872 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 873 up_write(&fs_info->extent_commit_sem);
876 874
875 btrfs_after_dev_replace_commit(fs_info);
876
877 return 0; 877 return 0;
878} 878}
879 879
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 958 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 959 struct btrfs_trans_handle *trans;
960 int ret; 960 int ret;
961 unsigned long nr;
962 961
963 if (xchg(&root->defrag_running, 1)) 962 if (xchg(&root->defrag_running, 1))
964 return 0; 963 return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 969
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 970 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 971
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 972 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 973 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 974 cond_resched();
977 975
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1031
1034 if (to_reserve > 0) { 1032 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1034 to_reserve,
1035 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1036 if (ret) {
1038 pending->error = ret; 1037 pending->error = ret;
1039 goto no_free_objectid; 1038 goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1190 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1191 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so it is impossible. */ 1192 /* We have checked the name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST); 1193 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1194 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1195 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1196 goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1308 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1309 * Tell lockdep about it.
1311 */ 1310 */
1312 rwsem_acquire_read( 1311 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1312 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315 1315
1316 current->journal_info = ac->newtrans; 1316 current->journal_info = ac->newtrans;
1317 1317
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1350 * async commit thread will be the one to unlock it.
1351 */ 1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1352 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1353 rwsem_release(
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_);
1354 1356
1355 schedule_delayed_work(&ac->work, 0); 1357 schedule_delayed_work(&ac->work, 0);
1356 1358
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1402 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1403}
1402 1404
1405static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1406 struct btrfs_root *root)
1407{
1408 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1409 int snap_pending = 0;
1410 int ret;
1411
1412 if (!flush_on_commit) {
1413 spin_lock(&root->fs_info->trans_lock);
1414 if (!list_empty(&trans->transaction->pending_snapshots))
1415 snap_pending = 1;
1416 spin_unlock(&root->fs_info->trans_lock);
1417 }
1418
1419 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1);
1421 btrfs_wait_ordered_extents(root, 1);
1422 }
1423
1424 ret = btrfs_run_delayed_items(trans, root);
1425 if (ret)
1426 return ret;
1427
1428 /*
1429 * running the delayed items may have added new refs. account
1430 * them now so that they hinder processing of more delayed refs
1431 * as little as possible.
1432 */
1433 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1434
1435 /*
1436 * rename doesn't use btrfs_join_transaction, so once we
1437 * set the transaction to blocked above, we aren't going
1438 * to get any new ordered operations. We can safely run
1439 * it here and know for sure that nothing new will be added
1440 * to the list
1441 */
1442 btrfs_run_ordered_operations(root, 1);
1443
1444 return 0;
1445}
1446
1403/* 1447/*
1404 * btrfs_transaction state sequence: 1448 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1449 * in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1458 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1459 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1460 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1461 int ret;
1418 int should_grow = 0; 1462 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1463 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1464
1422 btrfs_run_ordered_operations(root, 0); 1465 ret = btrfs_run_ordered_operations(root, 0);
1466 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction;
1469 }
1423 1470
1424 if (cur_trans->aborted) 1471 if (cur_trans->aborted) {
1472 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1473 goto cleanup_transaction;
1474 }
1426 1475
1427 /* make a pass through all the delayed refs we have so far 1476 /* make a pass through all the delayed refs we have so far
1428 * any running procs may add more while we are here 1477 * any running procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1539 should_grow = 1;
1491 1540
1492 do { 1541 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1542 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1543
1499 WARN_ON(cur_trans != trans->transaction); 1544 WARN_ON(cur_trans != trans->transaction);
1500 1545
1501 if (flush_on_commit || snap_pending) { 1546 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1547 if (ret)
1508 goto cleanup_transaction; 1548 goto cleanup_transaction;
1509 1549
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename doesn't use btrfs_join_transaction, so once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and know for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1550 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1551 TASK_UNINTERRUPTIBLE);
1528 1552
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1559 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1560 (should_grow && cur_trans->num_joined != joined));
1537 1561
1562 ret = btrfs_flush_all_pending_stuffs(trans, root);
1563 if (ret)
1564 goto cleanup_transaction;
1565
1538 /* 1566 /*
1539 * Ok now we need to make sure to block out any other joins while we 1567 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1568 * commit the transaction. We could have started a join before setting
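
Worth noting in the commit path: the flushing work is factored into btrfs_flush_all_pending_stuffs() and invoked twice, once per pass of the writer-wait loop and once more after it, presumably because the last writer to leave can queue new work between the final in-loop flush and the moment num_writers drops to one. A userspace analogue of that loop, assuming pthreads (invented names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int num_writers = 2;	/* us + one straggler */

static void flush_pending(void)	/* cf. btrfs_flush_all_pending_stuffs() */
{
	puts("flush pending work");
}

static void *straggler(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	num_writers--;		/* cf. the other writer detaching */
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, straggler, NULL);

	pthread_mutex_lock(&lock);
	while (num_writers > 1) {
		pthread_mutex_unlock(&lock);
		flush_pending();	/* do work between checks */
		pthread_mutex_lock(&lock);
		if (num_writers > 1)	/* re-check before sleeping */
			pthread_cond_wait(&wake, &lock);
	}
	pthread_mutex_unlock(&lock);

	flush_pending();	/* final pass: more work may have been queued */
	pthread_join(t, NULL);
	return 0;
}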
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..83186c7e45d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
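
fill_inode_item() now funnels every field through the btrfs_set_token_*() setters. The token exists for locality: consecutive writes that land on the same extent-buffer page reuse one cached mapping instead of re-mapping per field. An illustrative userspace model of that caching, with invented types and a flat buffer in place of a real extent buffer:

#include <stdio.h>
#include <string.h>

static unsigned char page[4096];	/* stands in for one metadata page */
static int map_calls;

struct map_token {
	unsigned char *kaddr;		/* cached mapping, NULL = cold */
	size_t start, len;		/* range the mapping covers */
};

static unsigned char *token_map(struct map_token *tok, size_t off, size_t n)
{
	if (!tok->kaddr || off < tok->start ||
	    off + n > tok->start + tok->len) {
		map_calls++;		/* cf. an actual kmap of the page */
		tok->kaddr = page;
		tok->start = 0;
		tok->len = sizeof(page);
	}
	return tok->kaddr + (off - tok->start);
}

static void set_u64(struct map_token *tok, size_t off, unsigned long long v)
{
	memcpy(token_map(tok, off, sizeof(v)), &v, sizeof(v));
}

int main(void)
{
	struct map_token tok = { 0 };

	set_u64(&tok, 0, 1);	/* maps once ... */
	set_u64(&tok, 8, 2);	/* ... then hits the cache */
	set_u64(&tok, 16, 3);
	printf("map calls: %d\n", map_calls);	/* 1, not 3 */
	return 0;
}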
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents, so if we have an
3229 * existing extent we want to adjust we _have_ to check the next
3230 * guy to make sure we even need this extent anymore; this keeps
3231 * us from panicking in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
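
drop_adjacent_extents() collects doomed leaf slots in del_slot/del_nr and removes each contiguous run with a single btrfs_del_items() call instead of one tree operation per item. The same bookkeeping over a plain array, as a runnable sketch (invented names):

#include <stdio.h>
#include <string.h>

static void delete_range(int *a, size_t *n, size_t slot, size_t nr)
{
	/* cf. one btrfs_del_items() call covering the whole run */
	memmove(a + slot, a + slot + nr, (*n - slot - nr) * sizeof(*a));
	*n -= nr;
}

static void drop_matching(int *a, size_t *n, int (*doomed)(int))
{
	size_t i = 0, del_slot = 0, del_nr = 0;

	while (i < *n) {
		if (doomed(a[i])) {
			if (!del_nr)
				del_slot = i;	/* start of a new run */
			del_nr++;
			i++;
			continue;
		}
		if (del_nr) {		/* run ended: delete it in one go */
			delete_range(a, n, del_slot, del_nr);
			i = del_slot;	/* survivors shifted down */
			del_nr = 0;
			continue;
		}
		i++;
	}
	if (del_nr)			/* trailing run */
		delete_range(a, n, del_slot, del_nr);
}

static int is_odd(int v) { return v & 1; }

int main(void)
{
	int a[] = { 1, 3, 4, 5, 7, 8 };
	size_t i, n = 6;

	drop_matching(a, &n, is_odd);
	for (i = 0; i < n; i++)
		printf("%d ", a[i]);	/* prints: 4 8 */
	printf("\n");
	return 0;
}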
3139 3266
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case where we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191
3192 path->slots[0]--;
3193 btrfs_item_key_to_cpu(path->nodes[0], &key,
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3356
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3357 if (skip_csum)
3203 struct btrfs_file_extent_item); 3358 return 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
3205 fi);
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3359
3251 if (path->slots[0] < nritems) { 3360 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3361 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3362 em->block_start + csum_offset,
3254 break; 3363 em->block_start + csum_offset +
3255 } 3364 csum_len - 1, &ordered_sums, 0);
3365 if (ret)
3366 return ret;
3256 3367
3257 if (args->nr) { 3368 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3369 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3370 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3371 list);
3261 if (ret) 3372 if (!ret)
3262 return ret; 3373 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3374 list_del(&sums->list);
3264 btrfs_release_path(path); 3375 kfree(sums);
3265 }
3266 } 3376 }
3267 3377
3268 return 0; 3378 return ret;
3269} 3379}
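
The ordered_sums drain at the end of log_one_extent() uses a sticky error: ret is only assigned while it is still zero, so the first btrfs_csum_file_blocks() failure is preserved, yet every list node is still unlinked and freed, leaving nothing behind on the error path. Reduced to a singly linked list in plain C (illustrative names):

#include <stdio.h>
#include <stdlib.h>

struct sum {
	int payload;
	struct sum *next;
};

static int emit(struct sum *s)	/* cf. btrfs_csum_file_blocks() */
{
	return s->payload == 2 ? -5 : 0;	/* fail on one entry */
}

static int drain(struct sum **head)
{
	int ret = 0;

	while (*head) {
		struct sum *s = *head;

		*head = s->next;	/* cf. list_del() */
		if (!ret)
			ret = emit(s);	/* first error wins, rest skipped */
		free(s);		/* freed on every path: no leak */
	}
	return ret;
}

int main(void)
{
	struct sum *head = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		struct sum *s = malloc(sizeof(*s));

		if (!s)
			break;
		s->payload = i;
		s->next = head;
		head = s;
	}
	printf("drain: %d\n", drain(&head));	/* -5, list fully freed */
	return 0;
}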
3270 3380
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3381static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3382 struct btrfs_root *root,
3273 struct inode *inode, 3383 struct inode *inode,
3274 struct btrfs_path *path, 3384 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3385{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3386 struct extent_map *em, *n;
3279 struct list_head extents; 3387 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3388 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3391
3284 INIT_LIST_HEAD(&extents); 3392 INIT_LIST_HEAD(&extents);
3285 3393
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3394 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3395 test_gen = root->fs_info->last_trans_committed;
3290 3396
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3317 3423
3318 write_unlock(&tree->lock); 3424 write_unlock(&tree->lock);
3319 3425
3320 /* 3426 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em); 3427 free_extent_map(em);
3340 write_lock(&tree->lock); 3428 write_lock(&tree->lock);
3341 } 3429 }
3342 WARN_ON(!list_empty(&extents)); 3430 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3431 write_unlock(&tree->lock);
3344 3432
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3433 btrfs_release_path(path);
3349 return ret; 3434 return ret;
3350} 3435}
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3485
3401 3486
3402 /* today the code can only do partial logging of directories */ 3487 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3488 if (S_ISDIR(inode->i_mode) ||
3489 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3490 &BTRFS_I(inode)->runtime_flags) &&
3491 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3492 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3493 else
3406 max_key.type = (u8)-1; 3494 max_key.type = (u8)-1;
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3520 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3521 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3522 &BTRFS_I(inode)->runtime_flags)) {
3523 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3524 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3525 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3526 inode, 0, 0);
3437 } else { 3527 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 if (inode_only == LOG_INODE_ALL)
3530 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3531 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3532 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3533 max_key.type);
3534 } else {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3537 ret = log_inode_item(trans, log, dst_path, inode);
3538 if (ret) {
3539 err = ret;
3540 goto out_unlock;
3541 }
3542 goto log_extents;
3442 } 3543 }
3544
3443 } 3545 }
3444 if (ret) { 3546 if (ret) {
3445 err = ret; 3547 err = ret;
@@ -3518,11 +3620,10 @@ next_slot:
3518 ins_nr = 0; 3620 ins_nr = 0;
3519 } 3621 }
3520 3622
3623log_extents:
3521 if (fast_search) { 3624 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3625 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3626 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3627 if (ret) {
3527 err = ret; 3628 err = ret;
3528 goto out_unlock; 3629 goto out_unlock;
@@ -3531,8 +3632,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3632 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3633 struct extent_map *em, *n;
3533 3634
3635 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3636 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3637 list_del_init(&em->list);
3638 write_unlock(&tree->lock);
3536 } 3639 }
3537 3640
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3641 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..5cce6aa74012 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree and it is known
553 * whether the procedure is really active or
554 * not, which means whether this device is
555 * used or whether it should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
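
The step-0/step-1 logic in the hunk above boils down to a small keep/drop table: devices present in the metadata always survive, and the candidate replace target survives pass 0 unconditionally but survives pass 1 only if the on-disk replace state confirms it is the active target. Rendered as a standalone predicate; DEV_REPLACE_DEVID and struct dev are local stand-ins for the sketch.

#include <stdio.h>

#define DEV_REPLACE_DEVID ((unsigned long long)-1)   /* assumed sentinel id */

struct dev {
        unsigned long long devid;
        int in_fs_metadata;
        int is_tgtdev_for_dev_replace;
};

static int keep_device(const struct dev *d, int step)
{
        if (d->in_fs_metadata)
                return 1;
        /* step 0: replace state not read yet, keep the candidate target;
         * step 1: keep it only if it is really the active target */
        if (d->devid == DEV_REPLACE_DEVID &&
            (step == 0 || d->is_tgtdev_for_dev_replace))
                return 1;
        return 0;
}

int main(void)
{
        struct dev stale_tgt = { DEV_REPLACE_DEVID, 0, 0 };

        printf("step 0 keeps stale target: %d\n", keep_device(&stale_tgt, 0));
        printf("step 1 drops stale target: %d\n", keep_device(&stale_tgt, 1));
        return 0;
}
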
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder);
641 if (IS_ERR(bdev)) {
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
643 goto error;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
711 &bdev, &bh);
712 if (ret)
713 continue;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096);
756 if (ret)
757 goto error_close;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
809 if (ret)
810 goto error;
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
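
The label change in btrfs_scan_one_device() above guards against a super block whose label field fills the whole buffer with no terminating NUL, which would let printk's %s run past it. The same defensive clamp in isolation, with the 256-byte size taken from BTRFS_LABEL_SIZE.

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256                  /* BTRFS_LABEL_SIZE in the real code */

static void print_label(char label[LABEL_SIZE])
{
        if (label[0]) {
                if (label[LABEL_SIZE - 1])
                        label[LABEL_SIZE - 1] = '\0';   /* force termination */
                printf("device label %s\n", label);
        } else {
                printf("device label is empty\n");
        }
}

int main(void)
{
        char evil[LABEL_SIZE];

        memset(evil, 'A', sizeof(evil));        /* no NUL anywhere */
        print_label(evil);                      /* still safe to print */
        return 0;
}
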
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
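
Both this removal path and the balance path further down apply the same correction: while a replace runs, the target device inflates num_devices by one, so the RAID-level minimums are checked against the decremented count. The rule as a standalone function; replace_running stands in for btrfs_dev_replace_is_ongoing() called under the dev_replace lock.

#include <stdio.h>

static unsigned long long effective_devices(unsigned long long num_devices,
                                            int replace_running)
{
        if (replace_running && num_devices >= 1)
                num_devices--;          /* don't count the replace target */
        return num_devices;
}

static int can_remove_device(unsigned long long num_devices,
                             int replace_running, int raid10, int raid1)
{
        unsigned long long n = effective_devices(num_devices, replace_running);

        if (raid10 && n <= 4)
                return 0;       /* removal would take raid10 below four */
        if (raid1 && n <= 2)
                return 0;       /* removal would take raid1 below two */
        return 1;
}

int main(void)
{
        /* 5 devices, but one is a replace target: raid10 removal refused */
        printf("%d\n", can_remove_device(5, 1, 1, 0));  /* 0 */
        printf("%d\n", can_remove_device(5, 0, 1, 0));  /* 1 */
        return 0;
}
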
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1375 root->fs_info->bdev_holder);
1376 if (IS_ERR(bdev)) {
1377 ret = PTR_ERR(bdev);
1378 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1433 ret = btrfs_get_bdev_and_sb(device_path,
1434 FMODE_READ | FMODE_EXCL,
1435 root->fs_info->bdev_holder, 0,
1436 &bdev, &bh);
1437 if (ret)
1438 goto out;
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560
1496error_brelse: 1561error_brelse:
1497 brelse(bh); 1562 brelse(bh);
1498error_close:
1499 if (bdev) 1563 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1564 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1565out:
@@ -1512,6 +1576,112 @@ error_undo:
1512 goto error_brelse; 1576 goto error_brelse;
1513} 1577}
1514 1578
1579void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1580 struct btrfs_device *srcdev)
1581{
1582 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1583 list_del_rcu(&srcdev->dev_list);
1584 list_del_rcu(&srcdev->dev_alloc_list);
1585 fs_info->fs_devices->num_devices--;
1586 if (srcdev->missing) {
1587 fs_info->fs_devices->missing_devices--;
1588 fs_info->fs_devices->rw_devices++;
1589 }
1590 if (srcdev->can_discard)
1591 fs_info->fs_devices->num_can_discard--;
1592 if (srcdev->bdev)
1593 fs_info->fs_devices->open_devices--;
1594
1595 call_rcu(&srcdev->rcu, free_device);
1596}
1597
1598void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1599 struct btrfs_device *tgtdev)
1600{
1601 struct btrfs_device *next_device;
1602
1603 WARN_ON(!tgtdev);
1604 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1605 if (tgtdev->bdev) {
1606 btrfs_scratch_superblock(tgtdev);
1607 fs_info->fs_devices->open_devices--;
1608 }
1609 fs_info->fs_devices->num_devices--;
1610 if (tgtdev->can_discard)
1611 fs_info->fs_devices->num_can_discard++;
1612
1613 next_device = list_entry(fs_info->fs_devices->devices.next,
1614 struct btrfs_device, dev_list);
1615 if (tgtdev->bdev == fs_info->sb->s_bdev)
1616 fs_info->sb->s_bdev = next_device->bdev;
1617 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1618 fs_info->fs_devices->latest_bdev = next_device->bdev;
1619 list_del_rcu(&tgtdev->dev_list);
1620
1621 call_rcu(&tgtdev->rcu, free_device);
1622
1623 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1624}
1625
1626int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1627 struct btrfs_device **device)
1628{
1629 int ret = 0;
1630 struct btrfs_super_block *disk_super;
1631 u64 devid;
1632 u8 *dev_uuid;
1633 struct block_device *bdev;
1634 struct buffer_head *bh;
1635
1636 *device = NULL;
1637 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1638 root->fs_info->bdev_holder, 0, &bdev, &bh);
1639 if (ret)
1640 return ret;
1641 disk_super = (struct btrfs_super_block *)bh->b_data;
1642 devid = btrfs_stack_device_id(&disk_super->dev_item);
1643 dev_uuid = disk_super->dev_item.uuid;
1644 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1645 disk_super->fsid);
1646 brelse(bh);
1647 if (!*device)
1648 ret = -ENOENT;
1649 blkdev_put(bdev, FMODE_READ);
1650 return ret;
1651}
1652
1653int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1654 char *device_path,
1655 struct btrfs_device **device)
1656{
1657 *device = NULL;
1658 if (strcmp(device_path, "missing") == 0) {
1659 struct list_head *devices;
1660 struct btrfs_device *tmp;
1661
1662 devices = &root->fs_info->fs_devices->devices;
1663 /*
1664 * It is safe to read the devices since the volume_mutex
1665 * is held by the caller.
1666 */
1667 list_for_each_entry(tmp, devices, dev_list) {
1668 if (tmp->in_fs_metadata && !tmp->bdev) {
1669 *device = tmp;
1670 break;
1671 }
1672 }
1673
1674 if (!*device) {
1675 pr_err("btrfs: no missing device found\n");
1676 return -ENOENT;
1677 }
1678
1679 return 0;
1680 } else {
1681 return btrfs_find_device_by_path(root, device_path, device);
1682 }
1683}
1684
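
btrfs_find_device_missing_or_by_path() gives callers one entry point: the literal argument "missing" selects the first device that the metadata records but that has no backing block device, and anything else resolves as a path. The dispatch shape in a self-contained sketch; the structs and helpers below are placeholders, not btrfs functions.

#include <stdio.h>
#include <string.h>

struct device { const char *path; int in_fs_metadata; int has_bdev; };

static struct device *find_missing(struct device *devs, int n)
{
        for (int i = 0; i < n; i++)
                if (devs[i].in_fs_metadata && !devs[i].has_bdev)
                        return &devs[i];
        return NULL;
}

static struct device *find_by_path(struct device *devs, int n, const char *p)
{
        for (int i = 0; i < n; i++)
                if (strcmp(devs[i].path, p) == 0)
                        return &devs[i];
        return NULL;
}

static struct device *lookup(struct device *devs, int n, const char *arg)
{
        if (strcmp(arg, "missing") == 0)
                return find_missing(devs, n);
        return find_by_path(devs, n, arg);
}

int main(void)
{
        struct device devs[] = {
                { "/dev/sda", 1, 1 },
                { "/dev/sdb", 1, 0 },   /* in metadata, no backing bdev */
        };
        struct device *d = lookup(devs, 2, "missing");

        printf("missing -> %s\n", d ? d->path : "(none)");
        return 0;
}
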
1515/* 1685/*
1516 * does all the dirty work required for changing file system's UUID. 1686 * does all the dirty work required for changing file system's UUID.
1517 */ 1687 */
@@ -1630,7 +1800,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1800 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1801 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1802 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1803 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1804 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1805 BUG_ON(!device); /* Logic error */
1635 1806
1636 if (device->fs_devices->seeding) { 1807 if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1849 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1850
1680 devices = &root->fs_info->fs_devices->devices; 1851 devices = &root->fs_info->fs_devices->devices;
1681 /* 1852
1682 * we have the volume lock, so we don't need the extra 1853 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1854 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1855 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1856 ret = -EEXIST;
1857 mutex_unlock(
1858 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1859 goto error;
1689 } 1860 }
1690 } 1861 }
1862 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1863
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1865 if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1909 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1910 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1911 device->in_fs_metadata = 1;
1912 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1913 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1914 set_blocksize(device->bdev, 4096);
1742 1915
@@ -1844,6 +2017,98 @@ error:
1844 return ret; 2017 return ret;
1845} 2018}
1846 2019
2020int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2021 struct btrfs_device **device_out)
2022{
2023 struct request_queue *q;
2024 struct btrfs_device *device;
2025 struct block_device *bdev;
2026 struct btrfs_fs_info *fs_info = root->fs_info;
2027 struct list_head *devices;
2028 struct rcu_string *name;
2029 int ret = 0;
2030
2031 *device_out = NULL;
2032 if (fs_info->fs_devices->seeding)
2033 return -EINVAL;
2034
2035 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2036 fs_info->bdev_holder);
2037 if (IS_ERR(bdev))
2038 return PTR_ERR(bdev);
2039
2040 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2041
2042 devices = &fs_info->fs_devices->devices;
2043 list_for_each_entry(device, devices, dev_list) {
2044 if (device->bdev == bdev) {
2045 ret = -EEXIST;
2046 goto error;
2047 }
2048 }
2049
2050 device = kzalloc(sizeof(*device), GFP_NOFS);
2051 if (!device) {
2052 ret = -ENOMEM;
2053 goto error;
2054 }
2055
2056 name = rcu_string_strdup(device_path, GFP_NOFS);
2057 if (!name) {
2058 kfree(device);
2059 ret = -ENOMEM;
2060 goto error;
2061 }
2062 rcu_assign_pointer(device->name, name);
2063
2064 q = bdev_get_queue(bdev);
2065 if (blk_queue_discard(q))
2066 device->can_discard = 1;
2067 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2068 device->writeable = 1;
2069 device->work.func = pending_bios_fn;
2070 generate_random_uuid(device->uuid);
2071 device->devid = BTRFS_DEV_REPLACE_DEVID;
2072 spin_lock_init(&device->io_lock);
2073 device->generation = 0;
2074 device->io_width = root->sectorsize;
2075 device->io_align = root->sectorsize;
2076 device->sector_size = root->sectorsize;
2077 device->total_bytes = i_size_read(bdev->bd_inode);
2078 device->disk_total_bytes = device->total_bytes;
2079 device->dev_root = fs_info->dev_root;
2080 device->bdev = bdev;
2081 device->in_fs_metadata = 1;
2082 device->is_tgtdev_for_dev_replace = 1;
2083 device->mode = FMODE_EXCL;
2084 set_blocksize(device->bdev, 4096);
2085 device->fs_devices = fs_info->fs_devices;
2086 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2087 fs_info->fs_devices->num_devices++;
2088 fs_info->fs_devices->open_devices++;
2089 if (device->can_discard)
2090 fs_info->fs_devices->num_can_discard++;
2091 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2092
2093 *device_out = device;
2094 return ret;
2095
2096error:
2097 blkdev_put(bdev, FMODE_EXCL);
2098 return ret;
2099}
2100
2101void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2102 struct btrfs_device *tgtdev)
2103{
2104 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2105 tgtdev->io_width = fs_info->dev_root->sectorsize;
2106 tgtdev->io_align = fs_info->dev_root->sectorsize;
2107 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2108 tgtdev->dev_root = fs_info->dev_root;
2109 tgtdev->in_fs_metadata = 1;
2110}
2111
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2112static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2113 struct btrfs_device *device)
1849{ 2114{
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2165
1901 if (!device->writeable) 2166 if (!device->writeable)
1902 return -EACCES; 2167 return -EACCES;
1903 if (new_size <= device->total_bytes) 2168 if (new_size <= device->total_bytes ||
2169 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2170 return -EINVAL;
1905 2171
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2172 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2604 return 1;
2339} 2605}
2340 2606
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2607static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2608 struct btrfs_balance_args *bargs)
2355{ 2609{
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2768 return 1;
2515} 2769}
2516 2770
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2771static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2772{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2773 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2795 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2796 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2797 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2798 device->total_bytes - device->bytes_used > size_to_free ||
2799 device->is_tgtdev_for_dev_replace)
2554 continue; 2800 continue;
2555 2801
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2802 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2974 u64 allowed;
2729 int mixed = 0; 2975 int mixed = 0;
2730 int ret; 2976 int ret;
2977 u64 num_devices;
2731 2978
2732 if (btrfs_fs_closing(fs_info) || 2979 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2980 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3003 }
2757 } 3004 }
2758 3005
3006 num_devices = fs_info->fs_devices->num_devices;
3007 btrfs_dev_replace_lock(&fs_info->dev_replace);
3008 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3009 BUG_ON(num_devices < 1);
3010 num_devices--;
3011 }
3012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3013 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3014 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3015 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3016 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3017 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3018 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3019 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
2902 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3156 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2903 } 3157 }
2904 3158
3159 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2905 mutex_unlock(&fs_info->balance_mutex); 3160 mutex_unlock(&fs_info->balance_mutex);
2906 mutex_unlock(&fs_info->volume_mutex); 3161 mutex_unlock(&fs_info->volume_mutex);
2907 3162
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2924 return 0; 3179 return 0;
2925 } 3180 }
2926 3181
3182 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
2927 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3183 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2928 if (IS_ERR(tsk)) 3184 if (IS_ERR(tsk))
2929 return PTR_ERR(tsk); 3185 return PTR_ERR(tsk);
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3336 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3337 u64 diff = device->total_bytes - new_size;
3082 3338
3083 if (new_size >= device->total_bytes) 3339 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3340 return -EINVAL;
3085 3341
3086 path = btrfs_alloc_path(); 3342 path = btrfs_alloc_path();
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3491 return 0;
3236} 3492}
3237 3493
3494struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3495 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3496 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3497 { 1, 2, 1, 1, 1, 2 /* dup */ },
3498 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3499 { 1, 1, 0, 1, 1, 1 /* single */ },
3500};
3501
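
btrfs_raid_array indexes the per-profile parameters by RAID type, replacing the if/else ladder that the next hunk deletes; the column order matches struct btrfs_raid_attr as declared in the volumes.h part of this diff. A standalone rendering (the enum ordering raid10/raid1/dup/raid0/single is an assumption of the sketch).

#include <stdio.h>

enum raid_index { RAID10, RAID1, DUP, RAID0, SINGLE, NR_RAID_TYPES };

struct raid_attr {
        int sub_stripes, dev_stripes, devs_max, devs_min,
            devs_increment, ncopies;
};

static const struct raid_attr raid_array[NR_RAID_TYPES] = {
        [RAID10] = { 2, 1, 0, 4, 2, 2 },
        [RAID1]  = { 1, 1, 2, 2, 2, 2 },
        [DUP]    = { 1, 2, 1, 1, 1, 2 },
        [RAID0]  = { 1, 1, 0, 2, 1, 1 },
        [SINGLE] = { 1, 1, 0, 1, 1, 1 },
};

int main(void)
{
        const struct raid_attr *a = &raid_array[RAID1];

        /* devs_max == 0 means "as many devices as possible" */
        printf("raid1: min %d devs, %d copies\n", a->devs_min, a->ncopies);
        return 0;
}
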
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3502static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3503 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3504 struct map_lookup **map_ret,
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3264 int ndevs; 3528 int ndevs;
3265 int i; 3529 int i;
3266 int j; 3530 int j;
3531 int index;
3267 3532
3268 BUG_ON(!alloc_profile_is_valid(type, 0)); 3533 BUG_ON(!alloc_profile_is_valid(type, 0));
3269 3534
3270 if (list_empty(&fs_devices->alloc_list)) 3535 if (list_empty(&fs_devices->alloc_list))
3271 return -ENOSPC; 3536 return -ENOSPC;
3272 3537
3273 sub_stripes = 1; 3538 index = __get_raid_index(type);
3274 dev_stripes = 1;
3275 devs_increment = 1;
3276 ncopies = 1;
3277 devs_max = 0; /* 0 == as many as possible */
3278 devs_min = 1;
3279 3539
3280 /* 3540 sub_stripes = btrfs_raid_array[index].sub_stripes;
3281 * define the properties of each RAID type. 3541 dev_stripes = btrfs_raid_array[index].dev_stripes;
3282 * FIXME: move this to a global table and use it in all RAID 3542 devs_max = btrfs_raid_array[index].devs_max;
3283 * calculation code 3543 devs_min = btrfs_raid_array[index].devs_min;
3284 */ 3544 devs_increment = btrfs_raid_array[index].devs_increment;
3285 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3545 ncopies = btrfs_raid_array[index].ncopies;
3286 dev_stripes = 2;
3287 ncopies = 2;
3288 devs_max = 1;
3289 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3290 devs_min = 2;
3291 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3292 devs_increment = 2;
3293 ncopies = 2;
3294 devs_max = 2;
3295 devs_min = 2;
3296 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3297 sub_stripes = 2;
3298 devs_increment = 2;
3299 ncopies = 2;
3300 devs_min = 4;
3301 } else {
3302 devs_max = 1;
3303 }
3304 3546
3305 if (type & BTRFS_BLOCK_GROUP_DATA) { 3547 if (type & BTRFS_BLOCK_GROUP_DATA) {
3306 max_stripe_size = 1024 * 1024 * 1024; 3548 max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3347 cur = cur->next; 3589 cur = cur->next;
3348 3590
3349 if (!device->writeable) { 3591 if (!device->writeable) {
3350 printk(KERN_ERR 3592 WARN(1, KERN_ERR
3351 "btrfs: read-only device in alloc_list\n"); 3593 "btrfs: read-only device in alloc_list\n");
3352 WARN_ON(1);
3353 continue; 3594 continue;
3354 } 3595 }
3355 3596
3356 if (!device->in_fs_metadata) 3597 if (!device->in_fs_metadata ||
3598 device->is_tgtdev_for_dev_replace)
3357 continue; 3599 continue;
3358 3600
3359 if (device->total_bytes > device->bytes_used) 3601 if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3382 devices_info[ndevs].total_avail = total_avail; 3624 devices_info[ndevs].total_avail = total_avail;
3383 devices_info[ndevs].dev = device; 3625 devices_info[ndevs].dev = device;
3384 ++ndevs; 3626 ++ndevs;
3627 WARN_ON(ndevs > fs_devices->rw_devices);
3385 } 3628 }
3386 3629
3387 /* 3630 /*
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3740 } 3983 }
3741} 3984}
3742 3985
3743int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3986int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3744{ 3987{
3988 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3745 struct extent_map *em; 3989 struct extent_map *em;
3746 struct map_lookup *map; 3990 struct map_lookup *map;
3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 3991 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3761 else 4005 else
3762 ret = 1; 4006 ret = 1;
3763 free_extent_map(em); 4007 free_extent_map(em);
4008
4009 btrfs_dev_replace_lock(&fs_info->dev_replace);
4010 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4011 ret++;
4012 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4013
3764 return ret; 4014 return ret;
3765} 4015}
3766 4016
3767static int find_live_mirror(struct map_lookup *map, int first, int num, 4017static int find_live_mirror(struct btrfs_fs_info *fs_info,
3768 int optimal) 4018 struct map_lookup *map, int first, int num,
4019 int optimal, int dev_replace_is_ongoing)
3769{ 4020{
3770 int i;
3771 if (map->stripes[optimal].dev->bdev)
3772 return optimal;
3773 for (i = first; i < first + num; i++) {
3774 if (map->stripes[i].dev->bdev)
3775 return i;
3776 }
4021 int i;
4022 int tolerance;
4023 struct btrfs_device *srcdev;
4024
4025 if (dev_replace_is_ongoing &&
4026 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4027 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4028 srcdev = fs_info->dev_replace.srcdev;
4029 else
4030 srcdev = NULL;
4031
4032 /*
4033 * try to avoid the drive that is the source drive for a
4034 * dev-replace procedure, only choose it if no other non-missing
4035 * mirror is available
4036 */
4037 for (tolerance = 0; tolerance < 2; tolerance++) {
4038 if (map->stripes[optimal].dev->bdev &&
4039 (tolerance || map->stripes[optimal].dev != srcdev))
4040 return optimal;
4041 for (i = first; i < first + num; i++) {
4042 if (map->stripes[i].dev->bdev &&
4043 (tolerance || map->stripes[i].dev != srcdev))
4044 return i;
4045 }
4046 }
4047
3777 /* we couldn't find one that doesn't fail. Just return something 4048 /* we couldn't find one that doesn't fail. Just return something
3778 * and the io error handling code will clean up eventually 4049 * and the io error handling code will clean up eventually
3779 */ 4050 */
3780 return optimal; 4051 return optimal;
3781} 4052}
3782 4053
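
The rewritten find_live_mirror() makes mirror choice a two-pass search: pass 0 accepts only live stripes that are not the replace source, pass 1 relaxes that and takes any live stripe, so the source drive is read only as a last resort. The same selection over plain arrays, with indices standing in for device pointers.

#include <stdio.h>

static int pick_mirror(const int live[], int n, int optimal, int srcdev)
{
        for (int tolerance = 0; tolerance < 2; tolerance++) {
                if (live[optimal] && (tolerance || optimal != srcdev))
                        return optimal;
                for (int i = 0; i < n; i++)
                        if (live[i] && (tolerance || i != srcdev))
                                return i;
        }
        return optimal;         /* let the IO error path sort it out */
}

int main(void)
{
        int live[2] = { 1, 1 };

        /* stripe 0 is preferred but is the replace source: pass 0
         * steers the read to stripe 1 instead */
        printf("picked %d\n", pick_mirror(live, 2, 0, 0));
        return 0;
}
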
3783static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4054static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3784 u64 logical, u64 *length, 4055 u64 logical, u64 *length,
3785 struct btrfs_bio **bbio_ret, 4056 struct btrfs_bio **bbio_ret,
3786 int mirror_num) 4057 int mirror_num)
3787{ 4058{
3788 struct extent_map *em; 4059 struct extent_map *em;
3789 struct map_lookup *map; 4060 struct map_lookup *map;
4061 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4062 struct extent_map_tree *em_tree = &map_tree->map_tree;
3791 u64 offset; 4063 u64 offset;
3792 u64 stripe_offset; 4064 u64 stripe_offset;
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3800 int num_stripes; 4072 int num_stripes;
3801 int max_errors = 0; 4073 int max_errors = 0;
3802 struct btrfs_bio *bbio = NULL; 4074 struct btrfs_bio *bbio = NULL;
4075 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4076 int dev_replace_is_ongoing = 0;
4077 int num_alloc_stripes;
4078 int patch_the_first_stripe_for_dev_replace = 0;
4079 u64 physical_to_patch_in_first_stripe = 0;
3803 4080
3804 read_lock(&em_tree->lock); 4081 read_lock(&em_tree->lock);
3805 em = lookup_extent_mapping(em_tree, logical, *length); 4082 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3816 map = (struct map_lookup *)em->bdev; 4093 map = (struct map_lookup *)em->bdev;
3817 offset = logical - em->start; 4094 offset = logical - em->start;
3818 4095
3819 if (mirror_num > map->num_stripes)
3820 mirror_num = 0;
3821
3822 stripe_nr = offset; 4096 stripe_nr = offset;
3823 /* 4097 /*
3824 * stripe_nr counts the total number of stripes we have to stride 4098 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3845 if (!bbio_ret) 4119 if (!bbio_ret)
3846 goto out; 4120 goto out;
3847 4121
4122 btrfs_dev_replace_lock(dev_replace);
4123 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4124 if (!dev_replace_is_ongoing)
4125 btrfs_dev_replace_unlock(dev_replace);
4126
4127 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4128 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4129 dev_replace->tgtdev != NULL) {
4130 /*
4131 * in dev-replace case, for repair case (that's the only
4132 * case where the mirror is selected explicitly when
4133 * calling btrfs_map_block), blocks left of the left cursor
4134 * can also be read from the target drive.
4135 * For REQ_GET_READ_MIRRORS, the target drive is added as
4136 * the last one to the array of stripes. For READ, it also
4137 * needs to be supported using the same mirror number.
4138 * If the requested block is not left of the left cursor,
4139 * EIO is returned. This can happen because btrfs_num_copies()
4140 * returns one more in the dev-replace case.
4141 */
4142 u64 tmp_length = *length;
4143 struct btrfs_bio *tmp_bbio = NULL;
4144 int tmp_num_stripes;
4145 u64 srcdev_devid = dev_replace->srcdev->devid;
4146 int index_srcdev = 0;
4147 int found = 0;
4148 u64 physical_of_found = 0;
4149
4150 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4151 logical, &tmp_length, &tmp_bbio, 0);
4152 if (ret) {
4153 WARN_ON(tmp_bbio != NULL);
4154 goto out;
4155 }
4156
4157 tmp_num_stripes = tmp_bbio->num_stripes;
4158 if (mirror_num > tmp_num_stripes) {
4159 /*
4160 * REQ_GET_READ_MIRRORS does not contain this
4161 * mirror, that means that the requested area
4162 * is not left of the left cursor
4163 */
4164 ret = -EIO;
4165 kfree(tmp_bbio);
4166 goto out;
4167 }
4168
4169 /*
4170 * process the rest of the function using the mirror_num
4171 * of the source drive. Therefore look it up first.
4172 * At the end, patch the device pointer to the one of the
4173 * target drive.
4174 */
4175 for (i = 0; i < tmp_num_stripes; i++) {
4176 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4177 /*
4178 * In case of DUP, in order to keep it
4179 * simple, only add the mirror with the
4180 * lowest physical address
4181 */
4182 if (found &&
4183 physical_of_found <=
4184 tmp_bbio->stripes[i].physical)
4185 continue;
4186 index_srcdev = i;
4187 found = 1;
4188 physical_of_found =
4189 tmp_bbio->stripes[i].physical;
4190 }
4191 }
4192
4193 if (found) {
4194 mirror_num = index_srcdev + 1;
4195 patch_the_first_stripe_for_dev_replace = 1;
4196 physical_to_patch_in_first_stripe = physical_of_found;
4197 } else {
4198 WARN_ON(1);
4199 ret = -EIO;
4200 kfree(tmp_bbio);
4201 goto out;
4202 }
4203
4204 kfree(tmp_bbio);
4205 } else if (mirror_num > map->num_stripes) {
4206 mirror_num = 0;
4207 }
4208
3848 num_stripes = 1; 4209 num_stripes = 1;
3849 stripe_index = 0; 4210 stripe_index = 0;
3850 stripe_nr_orig = stripe_nr; 4211 stripe_nr_orig = stripe_nr;
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3859 stripe_nr_end - stripe_nr_orig); 4220 stripe_nr_end - stripe_nr_orig);
3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4221 stripe_index = do_div(stripe_nr, map->num_stripes);
3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4222 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3862 if (rw & (REQ_WRITE | REQ_DISCARD)) 4223 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3863 num_stripes = map->num_stripes; 4224 num_stripes = map->num_stripes;
3864 else if (mirror_num) 4225 else if (mirror_num)
3865 stripe_index = mirror_num - 1; 4226 stripe_index = mirror_num - 1;
3866 else { 4227 else {
3867 stripe_index = find_live_mirror(map, 0, 4228 stripe_index = find_live_mirror(fs_info, map, 0,
3868 map->num_stripes, 4229 map->num_stripes,
3869 current->pid % map->num_stripes); 4230 current->pid % map->num_stripes,
4231 dev_replace_is_ongoing);
3870 mirror_num = stripe_index + 1; 4232 mirror_num = stripe_index + 1;
3871 } 4233 }
3872 4234
3873 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4235 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3874 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4236 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3875 num_stripes = map->num_stripes; 4237 num_stripes = map->num_stripes;
3876 } else if (mirror_num) { 4238 } else if (mirror_num) {
3877 stripe_index = mirror_num - 1; 4239 stripe_index = mirror_num - 1;
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3885 stripe_index = do_div(stripe_nr, factor); 4247 stripe_index = do_div(stripe_nr, factor);
3886 stripe_index *= map->sub_stripes; 4248 stripe_index *= map->sub_stripes;
3887 4249
3888 if (rw & REQ_WRITE) 4250 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3889 num_stripes = map->sub_stripes; 4251 num_stripes = map->sub_stripes;
3890 else if (rw & REQ_DISCARD) 4252 else if (rw & REQ_DISCARD)
3891 num_stripes = min_t(u64, map->sub_stripes * 4253 num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3895 stripe_index += mirror_num - 1; 4257 stripe_index += mirror_num - 1;
3896 else { 4258 else {
3897 int old_stripe_index = stripe_index; 4259 int old_stripe_index = stripe_index;
3898 stripe_index = find_live_mirror(map, stripe_index, 4260 stripe_index = find_live_mirror(fs_info, map,
4261 stripe_index,
3899 map->sub_stripes, stripe_index + 4262 map->sub_stripes, stripe_index +
3900 current->pid % map->sub_stripes); 4263 current->pid % map->sub_stripes,
4264 dev_replace_is_ongoing);
3901 mirror_num = stripe_index - old_stripe_index + 1; 4265 mirror_num = stripe_index - old_stripe_index + 1;
3902 } 4266 }
3903 } else { 4267 } else {
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3911 } 4275 }
3912 BUG_ON(stripe_index >= map->num_stripes); 4276 BUG_ON(stripe_index >= map->num_stripes);
3913 4277
3914 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4278 num_alloc_stripes = num_stripes;
4279 if (dev_replace_is_ongoing) {
4280 if (rw & (REQ_WRITE | REQ_DISCARD))
4281 num_alloc_stripes <<= 1;
4282 if (rw & REQ_GET_READ_MIRRORS)
4283 num_alloc_stripes++;
4284 }
4285 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3915 if (!bbio) { 4286 if (!bbio) {
3916 ret = -ENOMEM; 4287 ret = -ENOMEM;
3917 goto out; 4288 goto out;
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3998 } 4369 }
3999 } 4370 }
4000 4371
4001 if (rw & REQ_WRITE) { 4372 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4373 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4003 BTRFS_BLOCK_GROUP_RAID10 | 4374 BTRFS_BLOCK_GROUP_RAID10 |
4004 BTRFS_BLOCK_GROUP_DUP)) { 4375 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
4006 } 4377 }
4007 } 4378 }
4008 4379
4380 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4381 dev_replace->tgtdev != NULL) {
4382 int index_where_to_add;
4383 u64 srcdev_devid = dev_replace->srcdev->devid;
4384
4385 /*
4386 * duplicate the write operations while the dev replace
4387 * procedure is running. Since the copying of the old disk
4388 * to the new disk takes place at run time while the
4389 * filesystem is mounted writable, the regular write
4390 * operations to the old disk have to be duplicated to go
4391 * to the new disk as well.
4392 * Note that device->missing is handled by the caller, and
4393 * that the write to the old disk is already set up in the
4394 * stripes array.
4395 */
4396 index_where_to_add = num_stripes;
4397 for (i = 0; i < num_stripes; i++) {
4398 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4399 /* write to new disk, too */
4400 struct btrfs_bio_stripe *new =
4401 bbio->stripes + index_where_to_add;
4402 struct btrfs_bio_stripe *old =
4403 bbio->stripes + i;
4404
4405 new->physical = old->physical;
4406 new->length = old->length;
4407 new->dev = dev_replace->tgtdev;
4408 index_where_to_add++;
4409 max_errors++;
4410 }
4411 }
4412 num_stripes = index_where_to_add;
4413 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4414 dev_replace->tgtdev != NULL) {
4415 u64 srcdev_devid = dev_replace->srcdev->devid;
4416 int index_srcdev = 0;
4417 int found = 0;
4418 u64 physical_of_found = 0;
4419
4420 /*
4421 * During the dev-replace procedure, the target drive can
4422 * also be used to read data in case it is needed to repair
4423 * a corrupt block elsewhere. This is possible if the
4424 * requested area is left of the left cursor. In this area,
4425 * the target drive is a full copy of the source drive.
4426 */
4427 for (i = 0; i < num_stripes; i++) {
4428 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4429 /*
4430 * In case of DUP, in order to keep it
4431 * simple, only add the mirror with the
4432 * lowest physical address
4433 */
4434 if (found &&
4435 physical_of_found <=
4436 bbio->stripes[i].physical)
4437 continue;
4438 index_srcdev = i;
4439 found = 1;
4440 physical_of_found = bbio->stripes[i].physical;
4441 }
4442 }
4443 if (found) {
4444 u64 length = map->stripe_len;
4445
4446 if (physical_of_found + length <=
4447 dev_replace->cursor_left) {
4448 struct btrfs_bio_stripe *tgtdev_stripe =
4449 bbio->stripes + num_stripes;
4450
4451 tgtdev_stripe->physical = physical_of_found;
4452 tgtdev_stripe->length =
4453 bbio->stripes[index_srcdev].length;
4454 tgtdev_stripe->dev = dev_replace->tgtdev;
4455
4456 num_stripes++;
4457 }
4458 }
4459 }
4460
4009 *bbio_ret = bbio; 4461 *bbio_ret = bbio;
4010 bbio->num_stripes = num_stripes; 4462 bbio->num_stripes = num_stripes;
4011 bbio->max_errors = max_errors; 4463 bbio->max_errors = max_errors;
4012 bbio->mirror_num = mirror_num; 4464 bbio->mirror_num = mirror_num;
4465
4466 /*
4467 * this is the case that REQ_READ && dev_replace_is_ongoing &&
4468 * mirror_num == num_stripes + 1 && dev_replace target drive is
4469 * available as a mirror
4470 */
4471 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4472 WARN_ON(num_stripes > 1);
4473 bbio->stripes[0].dev = dev_replace->tgtdev;
4474 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4475 bbio->mirror_num = map->num_stripes + 1;
4476 }
4013out: 4477out:
4478 if (dev_replace_is_ongoing)
4479 btrfs_dev_replace_unlock(dev_replace);
4014 free_extent_map(em); 4480 free_extent_map(em);
4015 return ret; 4481 return ret;
4016} 4482}
4017 4483
4018int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4484int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4019 u64 logical, u64 *length, 4485 u64 logical, u64 *length,
4020 struct btrfs_bio **bbio_ret, int mirror_num) 4486 struct btrfs_bio **bbio_ret, int mirror_num)
4021{ 4487{
4022 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4488 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4023 mirror_num); 4489 mirror_num);
4024} 4490}
4025 4491
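
The first dev-replace branch in __btrfs_map_block() above implements write fan-out: each stripe that targets the source device gets a twin appended that points at the target device, and max_errors rises with every twin because the extra copy is best-effort until the replace commits. Reduced to arrays, with devids standing in for device pointers and cap standing in for the over-allocation done via num_alloc_stripes.

#include <stdio.h>

struct stripe { int devid; unsigned long long physical; };

static int duplicate_writes(struct stripe *s, int num, int cap,
                            int srcdev, int tgtdev, int *max_errors)
{
        int add = num;

        for (int i = 0; i < num && add < cap; i++) {
                if (s[i].devid != srcdev)
                        continue;
                s[add] = s[i];          /* same physical, new device */
                s[add].devid = tgtdev;
                add++;
                (*max_errors)++;
        }
        return add;                     /* new num_stripes */
}

int main(void)
{
        struct stripe s[4] = { { 1, 0 }, { 2, 0 } };
        int max_errors = 0;
        int n = duplicate_writes(s, 2, 4, /*srcdev=*/1, /*tgtdev=*/9,
                                 &max_errors);

        printf("stripes %d (dev 1 mirrored to dev 9), max_errors %d\n",
               n, max_errors);
        return 0;
}
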
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4238 &device->work); 4704 &device->work);
4239} 4705}
4240 4706
4707static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4708 sector_t sector)
4709{
4710 struct bio_vec *prev;
4711 struct request_queue *q = bdev_get_queue(bdev);
4712 unsigned short max_sectors = queue_max_sectors(q);
4713 struct bvec_merge_data bvm = {
4714 .bi_bdev = bdev,
4715 .bi_sector = sector,
4716 .bi_rw = bio->bi_rw,
4717 };
4718
4719 if (bio->bi_vcnt == 0) {
4720 WARN_ON(1);
4721 return 1;
4722 }
4723
4724 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4725 if ((bio->bi_size >> 9) > max_sectors)
4726 return 0;
4727
4728 if (!q->merge_bvec_fn)
4729 return 1;
4730
4731 bvm.bi_size = bio->bi_size - prev->bv_len;
4732 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4733 return 0;
4734 return 1;
4735}
4736
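
bio_size_ok() asks two questions: does the bio fit the queue's per-request sector budget, and, if the driver installed merge_bvec_fn, does that callback still accept the final vector. A userspace model with a plain function pointer in place of the request-queue hook; the byte/sector arithmetic follows the code above.

#include <stdio.h>

struct queue {
        unsigned short max_sectors;
        int (*merge_fn)(unsigned int bio_bytes_without_last,
                        unsigned int last_len);
};

static int bio_size_ok(const struct queue *q, unsigned int bio_bytes,
                       unsigned int last_vec_len)
{
        if ((bio_bytes >> 9) > q->max_sectors)
                return 0;
        if (!q->merge_fn)
                return 1;
        /* merge callback returning less than the last vector rejects it */
        return q->merge_fn(bio_bytes - last_vec_len, last_vec_len)
               >= (int)last_vec_len;
}

int main(void)
{
        struct queue q = { .max_sectors = 256, .merge_fn = NULL };

        printf("%d\n", bio_size_ok(&q, 128 << 9, 4096));   /* fits: 1 */
        printf("%d\n", bio_size_ok(&q, 512 << 9, 4096));   /* too big: 0 */
        return 0;
}
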
4737static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4738 struct bio *bio, u64 physical, int dev_nr,
4739 int rw, int async)
4740{
4741 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4742
4743 bio->bi_private = bbio;
4744 bio->bi_private = merge_stripe_index_into_bio_private(
4745 bio->bi_private, (unsigned int)dev_nr);
4746 bio->bi_end_io = btrfs_end_bio;
4747 bio->bi_sector = physical >> 9;
4748#ifdef DEBUG
4749 {
4750 struct rcu_string *name;
4751
4752 rcu_read_lock();
4753 name = rcu_dereference(dev->name);
4754 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4755 "(%s id %llu), size=%u\n", rw,
4756 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4757 name->str, dev->devid, bio->bi_size);
4758 rcu_read_unlock();
4759 }
4760#endif
4761 bio->bi_bdev = dev->bdev;
4762 if (async)
4763 schedule_bio(root, dev, rw, bio);
4764 else
4765 btrfsic_submit_bio(rw, bio);
4766}
4767
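
submit_stripe_bio() reuses merge_stripe_index_into_bio_private(), defined earlier in this file; judging by its name and use, it folds the stripe index into spare low bits of the private pointer. The general pointer-tagging idiom it relies on, sketched with two low bits, which assumes at least 4-byte alignment of the pointee; this is an illustration of the idiom, not the btrfs implementation.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void *tag_ptr(void *p, unsigned int idx)
{
        assert(((uintptr_t)p & 3) == 0 && idx < 4);
        return (void *)((uintptr_t)p | idx);
}

static void *untag_ptr(void *p, unsigned int *idx)
{
        *idx = (uintptr_t)p & 3;
        return (void *)((uintptr_t)p & ~(uintptr_t)3);
}

int main(void)
{
        static long long bbio;          /* stand-in, at least 4-byte aligned */
        unsigned int idx;
        void *restored = untag_ptr(tag_ptr(&bbio, 3), &idx);

        printf("idx %u, restored ok: %d\n", idx, restored == (void *)&bbio);
        return 0;
}
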
4768static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4769 struct bio *first_bio, struct btrfs_device *dev,
4770 int dev_nr, int rw, int async)
4771{
4772 struct bio_vec *bvec = first_bio->bi_io_vec;
4773 struct bio *bio;
4774 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4775 u64 physical = bbio->stripes[dev_nr].physical;
4776
4777again:
4778 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4779 if (!bio)
4780 return -ENOMEM;
4781
4782 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4783 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4784 bvec->bv_offset) < bvec->bv_len) {
4785 u64 len = bio->bi_size;
4786
4787 atomic_inc(&bbio->stripes_pending);
4788 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4789 rw, async);
4790 physical += len;
4791 goto again;
4792 }
4793 bvec++;
4794 }
4795
4796 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4797 return 0;
4798}
4799
4800static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4801{
4802 atomic_inc(&bbio->error);
4803 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4804 bio->bi_private = bbio->private;
4805 bio->bi_end_io = bbio->end_io;
4806 bio->bi_bdev = (struct block_device *)
4807 (unsigned long)bbio->mirror_num;
4808 bio->bi_sector = logical >> 9;
4809 kfree(bbio);
4810 bio_endio(bio, -EIO);
4811 }
4812}
4813
4241int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4814int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4242 int mirror_num, int async_submit) 4815 int mirror_num, int async_submit)
4243{ 4816{
4244 struct btrfs_mapping_tree *map_tree;
4245 struct btrfs_device *dev; 4817 struct btrfs_device *dev;
4246 struct bio *first_bio = bio; 4818 struct bio *first_bio = bio;
4247 u64 logical = (u64)bio->bi_sector << 9; 4819 u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4253 struct btrfs_bio *bbio = NULL; 4825 struct btrfs_bio *bbio = NULL;
4254 4826
4255 length = bio->bi_size; 4827 length = bio->bi_size;
4256 map_tree = &root->fs_info->mapping_tree;
4257 map_length = length; 4828 map_length = length;
4258 4829
4259 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4830 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4260 mirror_num); 4831 mirror_num);
4261 if (ret) /* -ENOMEM */ 4832 if (ret)
4262 return ret; 4833 return ret;
4263 4834
4264 total_devs = bbio->num_stripes; 4835 total_devs = bbio->num_stripes;
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4847 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4277 4848
4278 while (dev_nr < total_devs) { 4849 while (dev_nr < total_devs) {
4850 dev = bbio->stripes[dev_nr].dev;
4851 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4852 bbio_error(bbio, first_bio, logical);
4853 dev_nr++;
4854 continue;
4855 }
4856
4857 /*
4858 * Check and see if we're ok with this bio based on its size
4859 * and offset with the given device.
4860 */
4861 if (!bio_size_ok(dev->bdev, first_bio,
4862 bbio->stripes[dev_nr].physical >> 9)) {
4863 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4864 dev_nr, rw, async_submit);
4865 BUG_ON(ret);
4866 dev_nr++;
4867 continue;
4868 }
4869
4279 if (dev_nr < total_devs - 1) { 4870 if (dev_nr < total_devs - 1) {
4280 bio = bio_clone(first_bio, GFP_NOFS); 4871 bio = bio_clone(first_bio, GFP_NOFS);
4281 BUG_ON(!bio); /* -ENOMEM */ 4872 BUG_ON(!bio); /* -ENOMEM */
4282 } else { 4873 } else {
4283 bio = first_bio; 4874 bio = first_bio;
4284 } 4875 }
4285 bio->bi_private = bbio;
4286 bio->bi_private = merge_stripe_index_into_bio_private(
4287 bio->bi_private, (unsigned int)dev_nr);
4288 bio->bi_end_io = btrfs_end_bio;
4289 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4290 dev = bbio->stripes[dev_nr].dev;
4291 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4292#ifdef DEBUG
4293 struct rcu_string *name;
4294
4295 rcu_read_lock();
4296 name = rcu_dereference(dev->name);
4297 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
4298 "(%s id %llu), size=%u\n", rw,
4299 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4300 name->str, dev->devid, bio->bi_size);
4301 rcu_read_unlock();
4302#endif
4303 bio->bi_bdev = dev->bdev;
4304 if (async_submit)
4305 schedule_bio(root, dev, rw, bio);
4306 else
4307 btrfsic_submit_bio(rw, bio);
4308 } else {
4309 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4310 bio->bi_sector = logical >> 9;
4311 bio_endio(bio, -EIO);
4312 }
4876
4877 submit_stripe_bio(root, bbio, bio,
4878 bbio->stripes[dev_nr].physical, dev_nr, rw,
4879 async_submit);
4313 dev_nr++; 4880 dev_nr++;
4314 } 4881 }
4315 return 0; 4882 return 0;
4316} 4883}
4317 4884
4318struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4885struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4319 u8 *uuid, u8 *fsid) 4886 u8 *uuid, u8 *fsid)
4320{ 4887{
4321 struct btrfs_device *device; 4888 struct btrfs_device *device;
4322 struct btrfs_fs_devices *cur_devices; 4889 struct btrfs_fs_devices *cur_devices;
4323 4890
4324 cur_devices = root->fs_info->fs_devices; 4891 cur_devices = fs_info->fs_devices;
4325 while (cur_devices) { 4892 while (cur_devices) {
4326 if (!fsid || 4893 if (!fsid ||
4327 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4894 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4402 em->bdev = (struct block_device *)map; 4969 em->bdev = (struct block_device *)map;
4403 em->start = logical; 4970 em->start = logical;
4404 em->len = length; 4971 em->len = length;
4972 em->orig_start = 0;
4405 em->block_start = 0; 4973 em->block_start = 0;
4406 em->block_len = em->len; 4974 em->block_len = em->len;
4407 4975
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4419 read_extent_buffer(leaf, uuid, (unsigned long) 4987 read_extent_buffer(leaf, uuid, (unsigned long)
4420 btrfs_stripe_dev_uuid_nr(chunk, i), 4988 btrfs_stripe_dev_uuid_nr(chunk, i),
4421 BTRFS_UUID_SIZE); 4989 BTRFS_UUID_SIZE);
4422 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4990 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4423 NULL); 4991 uuid, NULL);
4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4992 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4425 kfree(map); 4993 kfree(map);
4426 free_extent_map(em); 4994 free_extent_map(em);
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5029 device->io_align = btrfs_device_io_align(leaf, dev_item);
4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5030 device->io_width = btrfs_device_io_width(leaf, dev_item);
4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5031 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5032 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5033 device->is_tgtdev_for_dev_replace = 0;
4464 5034
4465 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5035 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4466 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5036 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
4538 return ret; 5108 return ret;
4539 } 5109 }
4540 5110
4541 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5111 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4542 if (!device || !device->bdev) { 5112 if (!device || !device->bdev) {
4543 if (!btrfs_test_opt(root, DEGRADED)) 5113 if (!btrfs_test_opt(root, DEGRADED))
4544 return -EIO; 5114 return -EIO;
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
4571 fill_device_from_item(leaf, dev_item, device); 5141 fill_device_from_item(leaf, dev_item, device);
4572 device->dev_root = root->fs_info->dev_root; 5142 device->dev_root = root->fs_info->dev_root;
4573 device->in_fs_metadata = 1; 5143 device->in_fs_metadata = 1;
4574 if (device->writeable) { 5144 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5145 device->fs_devices->total_rw_bytes += device->total_bytes;
4576 spin_lock(&root->fs_info->free_chunk_lock); 5146 spin_lock(&root->fs_info->free_chunk_lock);
4577 root->fs_info->free_chunk_space += device->total_bytes - 5147 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4930 int i; 5500 int i;
4931 5501
4932 mutex_lock(&fs_devices->device_list_mutex); 5502 mutex_lock(&fs_devices->device_list_mutex);
4933 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5503 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4934 mutex_unlock(&fs_devices->device_list_mutex); 5504 mutex_unlock(&fs_devices->device_list_mutex);
4935 5505
4936 if (!dev) { 5506 if (!dev) {
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5528 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4959 return 0; 5529 return 0;
4960} 5530}
5531
5532int btrfs_scratch_superblock(struct btrfs_device *device)
5533{
5534 struct buffer_head *bh;
5535 struct btrfs_super_block *disk_super;
5536
5537 bh = btrfs_read_dev_super(device->bdev);
5538 if (!bh)
5539 return -EINVAL;
5540 disk_super = (struct btrfs_super_block *)bh->b_data;
5541
5542 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5543 set_buffer_dirty(bh);
5544 sync_dirty_buffer(bh);
5545 brelse(bh);
5546
5547 return 0;
5548}
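
The new btrfs_scratch_superblock() above invalidates a device by clearing just the superblock magic rather than wiping the device. A minimal user-space sketch of the same idea follows; the 64KiB super offset and the 64 bytes of csum/fsid/bytenr/flags preceding the magic are assumptions about the on-disk layout, not something this patch states.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int scratch_super(const char *dev_path)
{
	char zeroes[8];
	int fd = open(dev_path, O_WRONLY);

	if (fd < 0)
		return -1;
	memset(zeroes, 0, sizeof(zeroes));
	/* 65536 (assumed super offset) + 64 (assumed magic offset) */
	if (pwrite(fd, zeroes, sizeof(zeroes), 65536 + 64) != sizeof(zeroes)) {
		close(fd);
		return -1;
	}
	fsync(fd);	/* make sure the zeroed magic reaches the media */
	close(fd);
	return 0;
}
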
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
189 int ncopies; /* how many copies the data has */
190};
191
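
The new btrfs_raid_attr lets per-profile constants become table-driven rather than scattered through switch statements. A hypothetical initializer showing how such a table could look; the index names and numbers here are illustrative assumptions, not quoted from the btrfs sources.

/* Illustrative values only -- indices and numbers are assumptions. */
enum { MY_RAID10, MY_RAID1, MY_RAID0, MY_SINGLE };

static const struct btrfs_raid_attr demo_raid_attr[] = {
	[MY_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0,
			.devs_min = 4, .devs_increment = 2, .ncopies = 2 },
	[MY_RAID1]  = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2,
			.devs_min = 2, .devs_increment = 2, .ncopies = 2 },
	[MY_RAID0]  = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
			.devs_min = 2, .devs_increment = 1, .ncopies = 1 },
	[MY_SINGLE] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 1,
			.devs_min = 1, .devs_increment = 1, .ncopies = 1 },
};
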
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
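
Several prototypes above switch from taking a btrfs_root to taking the btrfs_fs_info directly, so callers such as the dev-replace code no longer need a root in hand. A sketch of a caller under the new convention; the helper name is made up, and the locking mirrors the btrfs_get_dev_stats hunk earlier in this patch.

/* Hypothetical helper showing the post-patch calling convention. */
static int demo_lookup(struct btrfs_fs_info *fs_info, u64 devid)
{
	struct btrfs_device *dev;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? 0 : -ENODEV;
}
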
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;
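
The new else branch in do_setxattr() probes for an existing xattr before going further, so removing an attribute that does not exist returns early instead of walking into the delete path. A condensed restatement of that decision, wrapped in a demo helper (not kernel code):

/* Demo wrapper restating the hunk's logic; name and return codes are
 * for illustration only. */
static int demo_probe_existing(struct btrfs_root *root,
			       struct btrfs_path *path, u64 ino,
			       const char *name, u16 name_len,
			       const void *value)
{
	struct btrfs_dir_item *di;

	di = btrfs_lookup_xattr(NULL, root, path, ino, name, name_len, 0);
	if (IS_ERR(di))
		return PTR_ERR(di);	/* lookup failed outright */
	if (!di && !value)
		return 0;		/* removing an xattr that isn't there */
	btrfs_release_path(path);	/* found one (or have a value): replace */
	return 1;
}
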
diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283edb..c017a2dfb909 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49inline void 49void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{ 50{
52 bh->b_end_io = handler; 51 bh->b_end_io = handler;
53 bh->b_private = private; 52 bh->b_private = private;
@@ -555,7 +554,7 @@ void emergency_thaw_all(void)
555 */ 554 */
556int sync_mapping_buffers(struct address_space *mapping) 555int sync_mapping_buffers(struct address_space *mapping)
557{ 556{
558 struct address_space *buffer_mapping = mapping->assoc_mapping; 557 struct address_space *buffer_mapping = mapping->private_data;
559 558
560 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 559 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
561 return 0; 560 return 0;
@@ -588,10 +587,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
588 struct address_space *buffer_mapping = bh->b_page->mapping; 587 struct address_space *buffer_mapping = bh->b_page->mapping;
589 588
590 mark_buffer_dirty(bh); 589 mark_buffer_dirty(bh);
591 if (!mapping->assoc_mapping) { 590 if (!mapping->private_data) {
592 mapping->assoc_mapping = buffer_mapping; 591 mapping->private_data = buffer_mapping;
593 } else { 592 } else {
594 BUG_ON(mapping->assoc_mapping != buffer_mapping); 593 BUG_ON(mapping->private_data != buffer_mapping);
595 } 594 }
596 if (!bh->b_assoc_map) { 595 if (!bh->b_assoc_map) {
597 spin_lock(&buffer_mapping->private_lock); 596 spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +787,7 @@ void invalidate_inode_buffers(struct inode *inode)
788 if (inode_has_buffers(inode)) { 787 if (inode_has_buffers(inode)) {
789 struct address_space *mapping = &inode->i_data; 788 struct address_space *mapping = &inode->i_data;
790 struct list_head *list = &mapping->private_list; 789 struct list_head *list = &mapping->private_list;
791 struct address_space *buffer_mapping = mapping->assoc_mapping; 790 struct address_space *buffer_mapping = mapping->private_data;
792 791
793 spin_lock(&buffer_mapping->private_lock); 792 spin_lock(&buffer_mapping->private_lock);
794 while (!list_empty(list)) 793 while (!list_empty(list))
@@ -811,7 +810,7 @@ int remove_inode_buffers(struct inode *inode)
811 if (inode_has_buffers(inode)) { 810 if (inode_has_buffers(inode)) {
812 struct address_space *mapping = &inode->i_data; 811 struct address_space *mapping = &inode->i_data;
813 struct list_head *list = &mapping->private_list; 812 struct list_head *list = &mapping->private_list;
814 struct address_space *buffer_mapping = mapping->assoc_mapping; 813 struct address_space *buffer_mapping = mapping->private_data;
815 814
816 spin_lock(&buffer_mapping->private_lock); 815 spin_lock(&buffer_mapping->private_lock);
817 while (!list_empty(list)) { 816 while (!list_empty(list)) {
@@ -850,13 +849,10 @@ try_again:
850 if (!bh) 849 if (!bh)
851 goto no_grow; 850 goto no_grow;
852 851
853 bh->b_bdev = NULL;
854 bh->b_this_page = head; 852 bh->b_this_page = head;
855 bh->b_blocknr = -1; 853 bh->b_blocknr = -1;
856 head = bh; 854 head = bh;
857 855
858 bh->b_state = 0;
859 atomic_set(&bh->b_count, 0);
860 bh->b_size = size; 856 bh->b_size = size;
861 857
862 /* Link the buffer to its page */ 858 /* Link the buffer to its page */
@@ -911,6 +907,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
911 attach_page_buffers(page, head); 907 attach_page_buffers(page, head);
912} 908}
913 909
910static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
911{
912 sector_t retval = ~((sector_t)0);
913 loff_t sz = i_size_read(bdev->bd_inode);
914
915 if (sz) {
916 unsigned int sizebits = blksize_bits(size);
917 retval = (sz >> sizebits);
918 }
919 return retval;
920}
921
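
blkdev_max_block() now takes the block size explicitly, so the limit reflects the caller's block size instead of a cached one. Worked numbers under an assumed 4GiB device, as a sketch: blksize_bits(4096) is 12, so the device holds 1048576 4KiB blocks; with 512-byte blocks the same device would yield 8388608.

/* Worked numbers for the shift above (device size is an assumption). */
static sector_t demo_end_block(void)
{
	loff_t sz = 4ULL << 30;				/* 4GiB device */
	unsigned int sizebits = blksize_bits(4096);	/* ilog2(4096) = 12 */

	return sz >> sizebits;				/* 1048576 blocks */
}
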
914/* 922/*
915 * Initialise the state of a blockdev page's buffers. 923 * Initialise the state of a blockdev page's buffers.
916 */ 924 */
@@ -921,7 +929,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
921 struct buffer_head *head = page_buffers(page); 929 struct buffer_head *head = page_buffers(page);
922 struct buffer_head *bh = head; 930 struct buffer_head *bh = head;
923 int uptodate = PageUptodate(page); 931 int uptodate = PageUptodate(page);
924 sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); 932 sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
925 933
926 do { 934 do {
927 if (!buffer_mapped(bh)) { 935 if (!buffer_mapped(bh)) {
@@ -1553,6 +1561,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1553EXPORT_SYMBOL(unmap_underlying_metadata); 1561EXPORT_SYMBOL(unmap_underlying_metadata);
1554 1562
1555/* 1563/*
1564 * Size is a power-of-two in the range 512..PAGE_SIZE,
1565 * and the case we care about most is PAGE_SIZE.
1566 *
1567 * So this *could* possibly be written with those
1568 * constraints in mind (relevant mostly if some
1569 * architecture has a slow bit-scan instruction)
1570 */
1571static inline int block_size_bits(unsigned int blocksize)
1572{
1573 return ilog2(blocksize);
1574}
1575
1576static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1577{
1578 BUG_ON(!PageLocked(page));
1579
1580 if (!page_has_buffers(page))
1581 create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1582 return page_buffers(page);
1583}
1584
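
create_page_buffers() snapshots i_blkbits through ACCESS_ONCE so one call cannot observe two different block sizes if the size changes concurrently, and the later hunks re-derive the geometry from the buffers themselves. The per-page pattern they converge on, in a demo helper (the 4KiB size in the comments is an assumption):

/* Demo of the per-page geometry derivation used by the hunks below. */
static void demo_block_geometry(struct buffer_head *head, pgoff_t index)
{
	unsigned int blocksize = head->b_size;		 /* e.g. 4096 */
	unsigned int bbits = block_size_bits(blocksize); /* ilog2(4096) = 12 */
	sector_t block = (sector_t)index << (PAGE_CACHE_SHIFT - bbits);

	(void)block;	/* first block covered by this page */
}
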
1585/*
1556 * NOTE! All mapped/uptodate combinations are valid: 1586 * NOTE! All mapped/uptodate combinations are valid:
1557 * 1587 *
1558 * Mapped Uptodate Meaning 1588 * Mapped Uptodate Meaning
@@ -1589,19 +1619,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1589 sector_t block; 1619 sector_t block;
1590 sector_t last_block; 1620 sector_t last_block;
1591 struct buffer_head *bh, *head; 1621 struct buffer_head *bh, *head;
1592 const unsigned blocksize = 1 << inode->i_blkbits; 1622 unsigned int blocksize, bbits;
1593 int nr_underway = 0; 1623 int nr_underway = 0;
1594 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1595 WRITE_SYNC : WRITE); 1625 WRITE_SYNC : WRITE);
1596 1626
1597 BUG_ON(!PageLocked(page)); 1627 head = create_page_buffers(page, inode,
1598
1599 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1600
1601 if (!page_has_buffers(page)) {
1602 create_empty_buffers(page, blocksize,
1603 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1628 (1 << BH_Dirty)|(1 << BH_Uptodate));
1604 }
1605 1629
1606 /* 1630 /*
1607 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1631 * Be very careful. We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1637,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1613 * handle that here by just cleaning them. 1637 * handle that here by just cleaning them.
1614 */ 1638 */
1615 1639
1616 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1617 head = page_buffers(page);
1618 bh = head; 1640 bh = head;
1641 blocksize = bh->b_size;
1642 bbits = block_size_bits(blocksize);
1643
1644 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1645 last_block = (i_size_read(inode) - 1) >> bbits;
1619 1646
1620 /* 1647 /*
1621 * Get all the dirty buffers mapped to disk addresses and 1648 * Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1833,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1806 BUG_ON(to > PAGE_CACHE_SIZE); 1833 BUG_ON(to > PAGE_CACHE_SIZE);
1807 BUG_ON(from > to); 1834 BUG_ON(from > to);
1808 1835
1809 blocksize = 1 << inode->i_blkbits; 1836 head = create_page_buffers(page, inode, 0);
1810 if (!page_has_buffers(page)) 1837 blocksize = head->b_size;
1811 create_empty_buffers(page, blocksize, 0); 1838 bbits = block_size_bits(blocksize);
1812 head = page_buffers(page);
1813 1839
1814 bbits = inode->i_blkbits;
1815 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1840 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1816 1841
1817 for(bh = head, block_start = 0; bh != head || !block_start; 1842 for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1906,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1881 unsigned blocksize; 1906 unsigned blocksize;
1882 struct buffer_head *bh, *head; 1907 struct buffer_head *bh, *head;
1883 1908
1884 blocksize = 1 << inode->i_blkbits; 1909 bh = head = page_buffers(page);
1910 blocksize = bh->b_size;
1885 1911
1886 for(bh = head = page_buffers(page), block_start = 0; 1912 block_start = 0;
1887 bh != head || !block_start; 1913 do {
1888 block_start=block_end, bh = bh->b_this_page) {
1889 block_end = block_start + blocksize; 1914 block_end = block_start + blocksize;
1890 if (block_end <= from || block_start >= to) { 1915 if (block_end <= from || block_start >= to) {
1891 if (!buffer_uptodate(bh)) 1916 if (!buffer_uptodate(bh))
@@ -1895,7 +1920,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1895 mark_buffer_dirty(bh); 1920 mark_buffer_dirty(bh);
1896 } 1921 }
1897 clear_buffer_new(bh); 1922 clear_buffer_new(bh);
1898 } 1923
1924 block_start = block_end;
1925 bh = bh->b_this_page;
1926 } while (bh != head);
1899 1927
1900 /* 1928 /*
1901 * If this is a partial write which happened to make all buffers 1929 * If this is a partial write which happened to make all buffers
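
__block_commit_write() above now walks the buffers with the canonical do/while idiom: a page's buffer_heads form a circular singly linked list through b_this_page, so the walk starts at the head and stops when it wraps around. The for loop it replaces encoded the same wrap test less obviously. The bare idiom, as a sketch:

/* The canonical walk of a page's circular buffer_head ring. */
static void demo_for_each_bh(struct page *page)
{
	struct buffer_head *bh, *head;

	bh = head = page_buffers(page);
	do {
		/* per-buffer work goes here */
		bh = bh->b_this_page;
	} while (bh != head);
}
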
@@ -2020,7 +2048,6 @@ EXPORT_SYMBOL(generic_write_end);
2020int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 2048int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2021 unsigned long from) 2049 unsigned long from)
2022{ 2050{
2023 struct inode *inode = page->mapping->host;
2024 unsigned block_start, block_end, blocksize; 2051 unsigned block_start, block_end, blocksize;
2025 unsigned to; 2052 unsigned to;
2026 struct buffer_head *bh, *head; 2053 struct buffer_head *bh, *head;
@@ -2029,13 +2056,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2029 if (!page_has_buffers(page)) 2056 if (!page_has_buffers(page))
2030 return 0; 2057 return 0;
2031 2058
2032 blocksize = 1 << inode->i_blkbits; 2059 head = page_buffers(page);
2060 blocksize = head->b_size;
2033 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); 2061 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2034 to = from + to; 2062 to = from + to;
2035 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) 2063 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2036 return 0; 2064 return 0;
2037 2065
2038 head = page_buffers(page);
2039 bh = head; 2066 bh = head;
2040 block_start = 0; 2067 block_start = 0;
2041 do { 2068 do {
@@ -2068,18 +2095,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2068 struct inode *inode = page->mapping->host; 2095 struct inode *inode = page->mapping->host;
2069 sector_t iblock, lblock; 2096 sector_t iblock, lblock;
2070 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 2097 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2071 unsigned int blocksize; 2098 unsigned int blocksize, bbits;
2072 int nr, i; 2099 int nr, i;
2073 int fully_mapped = 1; 2100 int fully_mapped = 1;
2074 2101
2075 BUG_ON(!PageLocked(page)); 2102 head = create_page_buffers(page, inode, 0);
2076 blocksize = 1 << inode->i_blkbits; 2103 blocksize = head->b_size;
2077 if (!page_has_buffers(page)) 2104 bbits = block_size_bits(blocksize);
2078 create_empty_buffers(page, blocksize, 0);
2079 head = page_buffers(page);
2080 2105
2081 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2106 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2082 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 2107 lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2083 bh = head; 2108 bh = head;
2084 nr = 0; 2109 nr = 0;
2085 i = 0; 2110 i = 0;
@@ -2864,6 +2889,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2864 bio_put(bio); 2889 bio_put(bio);
2865} 2890}
2866 2891
2892/*
2893 * This allows us to do IO even on the odd last sectors
2894 * of a device, even if the bh block size is some multiple
2895 * of the physical sector size.
2896 *
2897 * We'll just truncate the bio to the size of the device,
2898 * and clear the end of the buffer head manually.
2899 *
2900 * Truly out-of-range accesses will turn into actual IO
2901 * errors; this only handles the "we need to be able to
2902 * do IO at the final sector" case.
2903 */
2904static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2905{
2906 sector_t maxsector;
2907 unsigned bytes;
2908
2909 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2910 if (!maxsector)
2911 return;
2912
2913 /*
2914 * If the *whole* IO is past the end of the device,
2915 * let it through, and the IO layer will turn it into
2916 * an EIO.
2917 */
2918 if (unlikely(bio->bi_sector >= maxsector))
2919 return;
2920
2921 maxsector -= bio->bi_sector;
2922 bytes = bio->bi_size;
2923 if (likely((bytes >> 9) <= maxsector))
2924 return;
2925
2926 /* Uhhuh. We've got a bh that straddles the device size! */
2927 bytes = maxsector << 9;
2928
2929 /* Truncate the bio.. */
2930 bio->bi_size = bytes;
2931 bio->bi_io_vec[0].bv_len = bytes;
2932
2933 /* ..and clear the end of the buffer for reads */
2934 if ((rw & RW_MASK) == READ) {
2935 void *kaddr = kmap_atomic(bh->b_page);
2936 memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
2937 kunmap_atomic(kaddr);
2938 }
2939}
2940
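
A worked case for guard_bh_eod(), with assumed numbers: a 4097-sector device and a 1KiB buffer_head starting at sector 4096. The IO is not entirely past the end (4096 < 4097), but bytes >> 9 = 2 exceeds the 1 remaining sector, so the bio is truncated to 512 bytes and, for a read, the second half of the buffer is zeroed by hand:

/* Numbers for the case above (all assumed): 4097-sector device,
 * 1KiB buffer_head starting at the second-to-last sector. */
static unsigned demo_guard(void)
{
	sector_t maxsector = 4097;	/* i_size_read(...) >> 9 */
	sector_t start = 4096;		/* bio->bi_sector */
	unsigned bytes = 1024;		/* bio->bi_size: one 1KiB bh */

	if ((bytes >> 9) > maxsector - start)
		bytes = (maxsector - start) << 9;	/* truncated to 512 */
	return bytes;
}
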
2867int submit_bh(int rw, struct buffer_head * bh) 2941int submit_bh(int rw, struct buffer_head * bh)
2868{ 2942{
2869 struct bio *bio; 2943 struct bio *bio;
@@ -2900,6 +2974,9 @@ int submit_bh(int rw, struct buffer_head * bh)
2900 bio->bi_end_io = end_bio_bh_io_sync; 2974 bio->bi_end_io = end_bio_bh_io_sync;
2901 bio->bi_private = bh; 2975 bio->bi_private = bh;
2902 2976
2977 /* Take care of bh's that straddle the end of the device */
2978 guard_bh_eod(rw, bio, bh);
2979
2903 bio_get(bio); 2980 bio_get(bio);
2904 submit_bio(rw, bio); 2981 submit_bio(rw, bio);
2905 2982
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e5b77319c97b..8c1aabe93b67 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -454,7 +454,7 @@ static void reset_readdir(struct ceph_file_info *fi)
454 fi->flags &= ~CEPH_F_ATEND; 454 fi->flags &= ~CEPH_F_ATEND;
455} 455}
456 456
457static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 457static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
458{ 458{
459 struct ceph_file_info *fi = file->private_data; 459 struct ceph_file_info *fi = file->private_data;
460 struct inode *inode = file->f_mapping->host; 460 struct inode *inode = file->f_mapping->host;
@@ -463,7 +463,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
463 463
464 mutex_lock(&inode->i_mutex); 464 mutex_lock(&inode->i_mutex);
465 retval = -EINVAL; 465 retval = -EINVAL;
466 switch (origin) { 466 switch (whence) {
467 case SEEK_END: 467 case SEEK_END:
468 offset += inode->i_size + 2; /* FIXME */ 468 offset += inode->i_size + 2; /* FIXME */
469 break; 469 break;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9349bb37a2fe..ca3ab3f9ca70 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
56 struct ceph_nfs_confh *cfh = (void *)rawfh; 56 struct ceph_nfs_confh *cfh = (void *)rawfh;
57 int connected_handle_length = sizeof(*cfh)/4; 57 int connected_handle_length = sizeof(*cfh)/4;
58 int handle_length = sizeof(*fh)/4; 58 int handle_length = sizeof(*fh)/4;
59 struct dentry *dentry = d_find_alias(inode); 59 struct dentry *dentry;
60 struct dentry *parent; 60 struct dentry *parent;
61 61
62 /* don't re-export snaps */ 62 /* don't re-export snaps */
63 if (ceph_snap(inode) != CEPH_NOSNAP) 63 if (ceph_snap(inode) != CEPH_NOSNAP)
64 return -EINVAL; 64 return -EINVAL;
65 65
66 dentry = d_find_alias(inode);
67
66 /* if we found an alias, generate a connectable fh */ 68 /* if we found an alias, generate a connectable fh */
67 if (*max_len >= connected_handle_length && dentry) { 69 if (*max_len >= connected_handle_length && dentry) {
68 dout("encode_fh %p connectable\n", dentry); 70 dout("encode_fh %p connectable\n", dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5840d2aaed15..d4dfdcf76d7f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -797,7 +797,7 @@ out:
797/* 797/*
798 * llseek. be sure to verify file size on SEEK_END. 798 * llseek. be sure to verify file size on SEEK_END.
799 */ 799 */
800static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) 800static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
801{ 801{
802 struct inode *inode = file->f_mapping->host; 802 struct inode *inode = file->f_mapping->host;
803 int ret; 803 int ret;
@@ -805,7 +805,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
805 mutex_lock(&inode->i_mutex); 805 mutex_lock(&inode->i_mutex);
806 __ceph_do_pending_vmtruncate(inode); 806 __ceph_do_pending_vmtruncate(inode);
807 807
808 if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { 808 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
810 if (ret < 0) { 810 if (ret < 0) {
811 offset = ret; 811 offset = ret;
@@ -813,7 +813,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
813 } 813 }
814 } 814 }
815 815
816 switch (origin) { 816 switch (whence) {
817 case SEEK_END: 817 case SEEK_END:
818 offset += inode->i_size; 818 offset += inode->i_size;
819 break; 819 break;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2075ddfffa73..21ff76c22a17 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -122,9 +122,17 @@ config CIFS_ACL
122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob 122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
123 is handed over to the application/caller. 123 is handed over to the application/caller.
124 124
125config CIFS_DEBUG
126 bool "Enable CIFS debugging routines"
127 default y
128 depends on CIFS
129 help
130 Enabling this option adds helpful debugging messages to
131 the cifs code which increases the size of the cifs module.
132 If unsure, say Y.
125config CIFS_DEBUG2 133config CIFS_DEBUG2
126 bool "Enable additional CIFS debugging routines" 134 bool "Enable additional CIFS debugging routines"
127 depends on CIFS 135 depends on CIFS_DEBUG
128 help 136 help
129 Enabling this option adds a few more debugging routines 137 Enabling this option adds a few more debugging routines
130 to the cifs code which slightly increases the size of 138 to the cifs code which slightly increases the size of
diff --git a/fs/cifs/README b/fs/cifs/README
index 22ab7b5b8da7..2d5622f60e11 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -480,7 +480,7 @@ A partial list of the supported mount options follows:
480 Unicode on the wire. 480 Unicode on the wire.
481 nomapchars Do not translate any of these seven characters (default). 481 nomapchars Do not translate any of these seven characters (default).
482 nocase Request case insensitive path name matching (case 482 nocase Request case insensitive path name matching (case
483 sensitive is the default if the server suports it). 483 sensitive is the default if the server supports it).
484 (mount option "ignorecase" is identical to "nocase") 484 (mount option "ignorecase" is identical to "nocase")
485 posixpaths If CIFS Unix extensions are supported, attempt to 485 posixpaths If CIFS Unix extensions are supported, attempt to
486 negotiate posix path name support which allows certain 486 negotiate posix path name support which allows certain
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c0c68bb492d7..86e92ef2abc1 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -18,7 +18,6 @@
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * 19 *
20*/ 20*/
21#define CIFS_DEBUG /* BB temporary */
22 21
23#ifndef _H_CIFS_DEBUG 22#ifndef _H_CIFS_DEBUG
24#define _H_CIFS_DEBUG 23#define _H_CIFS_DEBUG
@@ -37,49 +36,43 @@ void dump_smb(void *, int);
37#define CIFS_RC 0x02 36#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 37#define CIFS_TIMER 0x04
39 38
39extern int cifsFYI;
40extern int cifsERROR;
41
40/* 42/*
41 * debug ON 43 * debug ON
42 * -------- 44 * --------
43 */ 45 */
44#ifdef CIFS_DEBUG 46#ifdef CONFIG_CIFS_DEBUG
45 47
46/* information message: e.g., configuration, major event */ 48/* information message: e.g., configuration, major event */
47extern int cifsFYI; 49#define cifsfyi(fmt, ...) \
48#define cifsfyi(fmt, arg...) \
49do { \ 50do { \
50 if (cifsFYI & CIFS_INFO) \ 51 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ 52 printk(KERN_DEBUG "%s: " fmt "\n", \
53 __FILE__, ##__VA_ARGS__); \
52} while (0) 54} while (0)
53 55
54#define cFYI(set, fmt, arg...) \ 56#define cFYI(set, fmt, ...) \
55do { \ 57do { \
56 if (set) \ 58 if (set) \
57 cifsfyi(fmt, ##arg); \ 59 cifsfyi(fmt, ##__VA_ARGS__); \
58} while (0) 60} while (0)
59 61
60#define cifswarn(fmt, arg...) \ 62#define cifswarn(fmt, ...) \
61 printk(KERN_WARNING fmt "\n", ##arg) 63 printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
62 64
63/* debug event message: */ 65/* error event message: e.g., i/o error */
64extern int cifsERROR; 66#define cifserror(fmt, ...) \
65
66#define cEVENT(fmt, arg...) \
67do { \ 67do { \
68 if (cifsERROR) \ 68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ 69 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
70} while (0)
71
72/* error event message: e.g., i/o error */
73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0) 70} while (0)
78 71
79#define cERROR(set, fmt, arg...) \ 72#define cERROR(set, fmt, ...) \
80do { \ 73do { \
81 if (set) \ 74 if (set) \
82 cifserror(fmt, ##arg); \ 75 cifserror(fmt, ##__VA_ARGS__); \
83} while (0) 76} while (0)
84 77
85/* 78/*
@@ -87,10 +80,27 @@ do { \
87 * --------- 80 * ---------
88 */ 81 */
89#else /* _CIFS_DEBUG */ 82#else /* _CIFS_DEBUG */
90#define cERROR(set, fmt, arg...) 83#define cifsfyi(fmt, ...) \
91#define cEVENT(fmt, arg...) 84do { \
92#define cFYI(set, fmt, arg...) 85 if (0) \
93#define cifserror(fmt, arg...) 86 printk(KERN_DEBUG "%s: " fmt "\n", \
87 __FILE__, ##__VA_ARGS__); \
88} while (0)
89#define cFYI(set, fmt, ...) \
90do { \
91 if (0 && set) \
92 cifsfyi(fmt, ##__VA_ARGS__); \
93} while (0)
94#define cifserror(fmt, ...) \
95do { \
96 if (0) \
97 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
98} while (0)
99#define cERROR(set, fmt, ...) \
100do { \
101 if (0 && set) \
102 cifserror(fmt, ##__VA_ARGS__); \
103} while (0)
94#endif /* _CIFS_DEBUG */ 104#endif /* _CIFS_DEBUG */
95 105
96#endif /* _H_CIFS_DEBUG */ 106#endif /* _H_CIFS_DEBUG */
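
With CONFIG_CIFS_DEBUG disabled, the macros above now expand to if (0) bodies instead of nothing, so format strings and arguments still pass through the compiler's printf checking and cannot bit-rot, while the optimizer removes the calls. The same trick in a self-contained user-space sketch (demo names):

#include <stdio.h>

/* Compile-out trick in miniature: format/argument checking survives
 * even when the message is compiled away. */
#ifdef DEMO_DEBUG
#define demo_dbg(fmt, ...) printf("demo: " fmt "\n", ##__VA_ARGS__)
#else
#define demo_dbg(fmt, ...)					\
do {								\
	if (0)							\
		printf("demo: " fmt "\n", ##__VA_ARGS__);	\
} while (0)
#endif

int main(void)
{
	demo_dbg("mounted %s as tree id %d", "//srv/share", 5);
	return 0;
}
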
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fc783e264420..5cbd00e74067 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = {
42/* group users */ 42/* group users */
43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
44 44
45const struct cred *root_cred; 45static const struct cred *root_cred;
46
47static void
48shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
49 int *nr_del)
50{
51 struct rb_node *node;
52 struct rb_node *tmp;
53 struct cifs_sid_id *psidid;
54
55 node = rb_first(root);
56 while (node) {
57 tmp = node;
58 node = rb_next(tmp);
59 psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
60 if (nr_to_scan == 0 || *nr_del == nr_to_scan)
61 ++(*nr_rem);
62 else {
63 if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
64 && psidid->refcount == 0) {
65 rb_erase(tmp, root);
66 ++(*nr_del);
67 } else
68 ++(*nr_rem);
69 }
70 }
71}
72
73/*
74 * Run idmap cache shrinker.
75 */
76static int
77cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
78{
79 int nr_to_scan = sc->nr_to_scan;
80 int nr_del = 0;
81 int nr_rem = 0;
82 struct rb_root *root;
83
84 root = &uidtree;
85 spin_lock(&siduidlock);
86 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
87 spin_unlock(&siduidlock);
88
89 root = &gidtree;
90 spin_lock(&sidgidlock);
91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
92 spin_unlock(&sidgidlock);
93
94 root = &siduidtree;
95 spin_lock(&uidsidlock);
96 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
97 spin_unlock(&uidsidlock);
98
99 root = &sidgidtree;
100 spin_lock(&gidsidlock);
101 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
102 spin_unlock(&gidsidlock);
103
104 return nr_rem;
105}
106
107static void
108sid_rb_insert(struct rb_root *root, unsigned long cid,
109 struct cifs_sid_id **psidid, char *typestr)
110{
111 char *strptr;
112 struct rb_node *node = root->rb_node;
113 struct rb_node *parent = NULL;
114 struct rb_node **linkto = &(root->rb_node);
115 struct cifs_sid_id *lsidid;
116
117 while (node) {
118 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
119 parent = node;
120 if (cid > lsidid->id) {
121 linkto = &(node->rb_left);
122 node = node->rb_left;
123 }
124 if (cid < lsidid->id) {
125 linkto = &(node->rb_right);
126 node = node->rb_right;
127 }
128 }
129
130 (*psidid)->id = cid;
131 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
132 (*psidid)->refcount = 0;
133
134 sprintf((*psidid)->sidstr, "%s", typestr);
135 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
136 sprintf(strptr, "%ld", cid);
137
138 clear_bit(SID_ID_PENDING, &(*psidid)->state);
139 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
140
141 rb_link_node(&(*psidid)->rbnode, parent, linkto);
142 rb_insert_color(&(*psidid)->rbnode, root);
143}
144
145static struct cifs_sid_id *
146sid_rb_search(struct rb_root *root, unsigned long cid)
147{
148 struct rb_node *node = root->rb_node;
149 struct cifs_sid_id *lsidid;
150
151 while (node) {
152 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
153 if (cid > lsidid->id)
154 node = node->rb_left;
155 else if (cid < lsidid->id)
156 node = node->rb_right;
157 else /* node found */
158 return lsidid;
159 }
160
161 return NULL;
162}
163
164static struct shrinker cifs_shrinker = {
165 .shrink = cifs_idmap_shrinker,
166 .seeks = DEFAULT_SEEKS,
167};
168 46
169static int 47static int
170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 48cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 49{
172 char *payload; 50 char *payload;
173 51
52 /*
53 * If the payload is less than or equal to the size of a pointer, then
54 * an allocation here is wasteful. Just copy the data directly to the
55 * payload.value union member instead.
56 *
57 * With this however, you must check the datalen before trying to
58 * dereference payload.data!
59 */
60 if (prep->datalen <= sizeof(key->payload)) {
61 key->payload.value = 0;
62 memcpy(&key->payload.value, prep->data, prep->datalen);
63 key->datalen = prep->datalen;
64 return 0;
65 }
174 payload = kmalloc(prep->datalen, GFP_KERNEL); 66 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 67 if (!payload)
176 return -ENOMEM; 68 return -ENOMEM;
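
cifs_idmap_key_instantiate() now stores payloads no larger than a pointer directly in the payload.value union member, avoiding a kmalloc for the common uid/gid-sized case; the destroy hook must then check datalen before freeing. The small-object embedding pattern in miniature, with demo types rather than the real key API:

#include <linux/slab.h>
#include <linux/string.h>

/* Demo types only; the real struct key payload differs. */
struct demo_payload {
	union {
		void *data;		/* heap copy for big payloads */
		unsigned long value;	/* inline copy for small ones */
	};
	unsigned short datalen;
};

static int demo_store(struct demo_payload *p, const void *src, size_t len)
{
	if (len <= sizeof(p->value)) {		/* fits inline in the union */
		p->value = 0;
		memcpy(&p->value, src, len);
	} else {
		p->data = kmemdup(src, len, GFP_KERNEL);
		if (!p->data)
			return -ENOMEM;
	}
	p->datalen = len;
	return 0;
}

static void demo_release(struct demo_payload *p)
{
	if (p->datalen > sizeof(p->value))	/* only big payloads were allocated */
		kfree(p->data);
}
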
@@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
184static inline void 76static inline void
185cifs_idmap_key_destroy(struct key *key) 77cifs_idmap_key_destroy(struct key *key)
186{ 78{
187 kfree(key->payload.data); 79 if (key->datalen > sizeof(key->payload))
80 kfree(key->payload.data);
188} 81}
189 82
190struct key_type cifs_idmap_key_type = { 83static struct key_type cifs_idmap_key_type = {
191 .name = "cifs.idmap", 84 .name = "cifs.idmap",
192 .instantiate = cifs_idmap_key_instantiate, 85 .instantiate = cifs_idmap_key_instantiate,
193 .destroy = cifs_idmap_key_destroy, 86 .destroy = cifs_idmap_key_destroy,
@@ -195,214 +88,174 @@ struct key_type cifs_idmap_key_type = {
195 .match = user_match, 88 .match = user_match,
196}; 89};
197 90
198static void 91static char *
199sid_to_str(struct cifs_sid *sidptr, char *sidstr) 92sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
200{ 93{
201 int i; 94 int i, len;
202 unsigned long saval; 95 unsigned int saval;
203 char *strptr; 96 char *sidstr, *strptr;
97 unsigned long long id_auth_val;
98
99 /* 3 bytes for prefix */
100 sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
101 (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
102 GFP_KERNEL);
103 if (!sidstr)
104 return sidstr;
204 105
205 strptr = sidstr; 106 strptr = sidstr;
107 len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
108 sidptr->revision);
109 strptr += len;
110
111 /* The authority field is a single 48-bit number */
112 id_auth_val = (unsigned long long)sidptr->authority[5];
113 id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
114 id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
115 id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
116 id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
117 id_auth_val |= (unsigned long long)sidptr->authority[0] << 48;
206 118
207 sprintf(strptr, "%s", "S"); 119 /*
208 strptr = sidstr + strlen(sidstr); 120 * MS-DTYP states that if the authority is >= 2^32, then it should be
209 121 * expressed as a hex value.
210 sprintf(strptr, "-%d", sidptr->revision); 122 */
211 strptr = sidstr + strlen(sidstr); 123 if (id_auth_val <= UINT_MAX)
124 len = sprintf(strptr, "-%llu", id_auth_val);
125 else
126 len = sprintf(strptr, "-0x%llx", id_auth_val);
212 127
213 for (i = 0; i < 6; ++i) { 128 strptr += len;
214 if (sidptr->authority[i]) {
215 sprintf(strptr, "-%d", sidptr->authority[i]);
216 strptr = sidstr + strlen(sidstr);
217 }
218 }
219 129
220 for (i = 0; i < sidptr->num_subauth; ++i) { 130 for (i = 0; i < sidptr->num_subauth; ++i) {
221 saval = le32_to_cpu(sidptr->sub_auth[i]); 131 saval = le32_to_cpu(sidptr->sub_auth[i]);
222 sprintf(strptr, "-%ld", saval); 132 len = sprintf(strptr, "-%u", saval);
223 strptr = sidstr + strlen(sidstr); 133 strptr += len;
224 } 134 }
135
136 return sidstr;
225} 137}
226 138
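
sid_to_key_str() produces the key description used for the upcall, for example "os:S-1-5-32-544" for an owner SID or "gs:S-1-5-21-1-2-3-513" for a group SID (example SIDs, not from the patch). The 48-bit authority is assembled from six big-endian bytes and printed in hex only when it exceeds UINT_MAX, per MS-DTYP. A loop equivalent to the unrolled shifts above:

/* Assembling the 48-bit id_auth value from its six big-endian bytes,
 * equivalent to the unrolled shifts above (bytes are illustrative). */
static unsigned long long demo_id_auth(const unsigned char authority[6])
{
	unsigned long long val = 0;
	int i;

	for (i = 0; i < 6; ++i)
		val = (val << 8) | authority[i];
	return val;	/* {0,0,0,0,0,5} -> 5, printed as "-5" */
}
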
227static void 139/*
228id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, 140 * if the two SIDs (roughly equivalent to a UUID for a user or group) are
229 struct cifs_sid_id **psidid, char *typestr) 141 * the same returns zero, if they do not match returns non-zero.
142 */
143static int
144compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
230{ 145{
231 int rc; 146 int i;
232 char *strptr; 147 int num_subauth, num_sat, num_saw;
233 struct rb_node *node = root->rb_node;
234 struct rb_node *parent = NULL;
235 struct rb_node **linkto = &(root->rb_node);
236 struct cifs_sid_id *lsidid;
237
238 while (node) {
239 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
240 parent = node;
241 rc = compare_sids(sidptr, &((lsidid)->sid));
242 if (rc > 0) {
243 linkto = &(node->rb_left);
244 node = node->rb_left;
245 } else if (rc < 0) {
246 linkto = &(node->rb_right);
247 node = node->rb_right;
248 }
249 }
250
251 memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
252 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
253 (*psidid)->refcount = 0;
254 148
255 sprintf((*psidid)->sidstr, "%s", typestr); 149 if ((!ctsid) || (!cwsid))
256 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); 150 return 1;
257 sid_to_str(&(*psidid)->sid, strptr);
258 151
259 clear_bit(SID_ID_PENDING, &(*psidid)->state); 152 /* compare the revision */
260 clear_bit(SID_ID_MAPPED, &(*psidid)->state); 153 if (ctsid->revision != cwsid->revision) {
154 if (ctsid->revision > cwsid->revision)
155 return 1;
156 else
157 return -1;
158 }
261 159
262 rb_link_node(&(*psidid)->rbnode, parent, linkto); 160 /* compare all of the six auth values */
263 rb_insert_color(&(*psidid)->rbnode, root); 161 for (i = 0; i < NUM_AUTHS; ++i) {
264} 162 if (ctsid->authority[i] != cwsid->authority[i]) {
163 if (ctsid->authority[i] > cwsid->authority[i])
164 return 1;
165 else
166 return -1;
167 }
168 }
265 169
266static struct cifs_sid_id * 170 /* compare all of the subauth values if any */
267id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) 171 num_sat = ctsid->num_subauth;
268{ 172 num_saw = cwsid->num_subauth;
269 int rc; 173 num_subauth = num_sat < num_saw ? num_sat : num_saw;
270 struct rb_node *node = root->rb_node; 174 if (num_subauth) {
271 struct cifs_sid_id *lsidid; 175 for (i = 0; i < num_subauth; ++i) {
272 176 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
273 while (node) { 177 if (le32_to_cpu(ctsid->sub_auth[i]) >
274 lsidid = rb_entry(node, struct cifs_sid_id, rbnode); 178 le32_to_cpu(cwsid->sub_auth[i]))
275 rc = compare_sids(sidptr, &((lsidid)->sid)); 179 return 1;
276 if (rc > 0) { 180 else
277 node = node->rb_left; 181 return -1;
278 } else if (rc < 0) { 182 }
279 node = node->rb_right; 183 }
280 } else /* node found */
281 return lsidid;
282 } 184 }
283 185
284 return NULL; 186 return 0; /* sids compare/match */
285} 187}
286 188
287static int 189static void
288sidid_pending_wait(void *unused) 190cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
289{ 191{
290 schedule(); 192 int i;
291 return signal_pending(current) ? -ERESTARTSYS : 0; 193
194 dst->revision = src->revision;
195 dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
196 for (i = 0; i < NUM_AUTHS; ++i)
197 dst->authority[i] = src->authority[i];
198 for (i = 0; i < dst->num_subauth; ++i)
199 dst->sub_auth[i] = src->sub_auth[i];
292} 200}
293 201
294static int 202static int
295id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) 203id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
296{ 204{
297 int rc = 0; 205 int rc;
298 struct key *sidkey; 206 struct key *sidkey;
207 struct cifs_sid *ksid;
208 unsigned int ksid_size;
209 char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
299 const struct cred *saved_cred; 210 const struct cred *saved_cred;
300 struct cifs_sid *lsid;
301 struct cifs_sid_id *psidid, *npsidid;
302 struct rb_root *cidtree;
303 spinlock_t *cidlock;
304
305 if (sidtype == SIDOWNER) {
306 cidlock = &siduidlock;
307 cidtree = &uidtree;
308 } else if (sidtype == SIDGROUP) {
309 cidlock = &sidgidlock;
310 cidtree = &gidtree;
311 } else
312 return -EINVAL;
313
314 spin_lock(cidlock);
315 psidid = sid_rb_search(cidtree, cid);
316
317 if (!psidid) { /* node does not exist, allocate one & attempt adding */
318 spin_unlock(cidlock);
319 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
320 if (!npsidid)
321 return -ENOMEM;
322 211
323 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); 212 rc = snprintf(desc, sizeof(desc), "%ci:%u",
324 if (!npsidid->sidstr) { 213 sidtype == SIDOWNER ? 'o' : 'g', cid);
325 kfree(npsidid); 214 if (rc >= sizeof(desc))
326 return -ENOMEM; 215 return -EINVAL;
327 }
328 216
329 spin_lock(cidlock); 217 rc = 0;
330 psidid = sid_rb_search(cidtree, cid); 218 saved_cred = override_creds(root_cred);
331 if (psidid) { /* node happened to get inserted meanwhile */ 219 sidkey = request_key(&cifs_idmap_key_type, desc, "");
332 ++psidid->refcount; 220 if (IS_ERR(sidkey)) {
333 spin_unlock(cidlock); 221 rc = -EINVAL;
334 kfree(npsidid->sidstr); 222 cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
335 kfree(npsidid); 223 sidtype == SIDOWNER ? 'u' : 'g', cid);
336 } else { 224 goto out_revert_creds;
337 psidid = npsidid; 225 } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
338 sid_rb_insert(cidtree, cid, &psidid, 226 rc = -EIO;
339 sidtype == SIDOWNER ? "oi:" : "gi:"); 227 cFYI(1, "%s: Downcall contained malformed key "
340 ++psidid->refcount; 228 "(datalen=%hu)", __func__, sidkey->datalen);
341 spin_unlock(cidlock); 229 goto invalidate_key;
342 }
343 } else {
344 ++psidid->refcount;
345 spin_unlock(cidlock);
346 } 230 }
347 231
348 /* 232 /*
349 * If we are here, it is safe to access psidid and its fields 233 * A sid is usually too large to be embedded in payload.value, but if
350 * since a reference was taken earlier while holding the spinlock. 234 * there are no subauthorities and the host has 8-byte pointers, then
351 * A reference on the node is put without holding the spinlock 235 * it could be.
352 * and it is OK to do so in this case, shrinker will not erase
353 * this node until all references are put and we do not access
354 * any fields of the node after a reference is put .
355 */ 236 */
356 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 237 ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
357 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); 238 (struct cifs_sid *)&sidkey->payload.value :
358 psidid->time = jiffies; /* update ts for accessing */ 239 (struct cifs_sid *)sidkey->payload.data;
359 goto id_sid_out; 240
360 } 241 ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
361 242 if (ksid_size > sidkey->datalen) {
362 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { 243 rc = -EIO;
363 rc = -EINVAL; 244 cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
364 goto id_sid_out; 245 "ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
246 goto invalidate_key;
365 } 247 }
366 248
367 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 249 cifs_copy_sid(ssid, ksid);
368 saved_cred = override_creds(root_cred); 250out_key_put:
369 sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 251 key_put(sidkey);
370 if (IS_ERR(sidkey)) { 252out_revert_creds:
371 rc = -EINVAL; 253 revert_creds(saved_cred);
372 cFYI(1, "%s: Can't map and id to a SID", __func__);
373 } else {
374 lsid = (struct cifs_sid *)sidkey->payload.data;
375 memcpy(&psidid->sid, lsid,
376 sidkey->datalen < sizeof(struct cifs_sid) ?
377 sidkey->datalen : sizeof(struct cifs_sid));
378 memcpy(ssid, &psidid->sid,
379 sidkey->datalen < sizeof(struct cifs_sid) ?
380 sidkey->datalen : sizeof(struct cifs_sid));
381 set_bit(SID_ID_MAPPED, &psidid->state);
382 key_put(sidkey);
383 kfree(psidid->sidstr);
384 }
385 psidid->time = jiffies; /* update ts for accessing */
386 revert_creds(saved_cred);
387 clear_bit(SID_ID_PENDING, &psidid->state);
388 wake_up_bit(&psidid->state, SID_ID_PENDING);
389 } else {
390 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
391 sidid_pending_wait, TASK_INTERRUPTIBLE);
392 if (rc) {
393 cFYI(1, "%s: sidid_pending_wait interrupted %d",
394 __func__, rc);
395 --psidid->refcount;
396 return rc;
397 }
398 if (test_bit(SID_ID_MAPPED, &psidid->state))
399 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
400 else
401 rc = -EINVAL;
402 }
403id_sid_out:
404 --psidid->refcount;
405 return rc; 254 return rc;
255
256invalidate_key:
257 key_invalidate(sidkey);
258 goto out_key_put;
406} 259}
407 260
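
id_to_sid() above now performs a single request_key() against a fixed-size description such as "oi:1000", leaving caching and concurrency to the keys subsystem instead of the removed rbtree cache. A sketch of building that description the way the function does (demo helper, uid 1000 assumed in the comment):

/* Building the upcall description as id_to_sid() does (demo helper). */
static int demo_idmap_desc(char *desc, size_t len, int owner, unsigned int cid)
{
	int n = snprintf(desc, len, "%ci:%u", owner ? 'o' : 'g', cid);

	/* e.g. owner, cid 1000 -> "oi:1000" */
	return (n < 0 || (size_t)n >= len) ? -EINVAL : 0;
}
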
408static int 261static int
@@ -410,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
410 struct cifs_fattr *fattr, uint sidtype) 263 struct cifs_fattr *fattr, uint sidtype)
411{ 264{
412 int rc; 265 int rc;
413 unsigned long cid; 266 struct key *sidkey;
414 struct key *idkey; 267 char *sidstr;
415 const struct cred *saved_cred; 268 const struct cred *saved_cred;
416 struct cifs_sid_id *psidid, *npsidid; 269 uid_t fuid = cifs_sb->mnt_uid;
417 struct rb_root *cidtree; 270 gid_t fgid = cifs_sb->mnt_gid;
418 spinlock_t *cidlock;
419
420 if (sidtype == SIDOWNER) {
421 cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
422 cidlock = &siduidlock;
423 cidtree = &uidtree;
424 } else if (sidtype == SIDGROUP) {
425 cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
426 cidlock = &sidgidlock;
427 cidtree = &gidtree;
428 } else
429 return -ENOENT;
430
431 spin_lock(cidlock);
432 psidid = id_rb_search(cidtree, psid);
433
434 if (!psidid) { /* node does not exist, allocate one & attempt adding */
435 spin_unlock(cidlock);
436 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
437 if (!npsidid)
438 return -ENOMEM;
439
440 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
441 if (!npsidid->sidstr) {
442 kfree(npsidid);
443 return -ENOMEM;
444 }
445
446 spin_lock(cidlock);
447 psidid = id_rb_search(cidtree, psid);
448 if (psidid) { /* node happened to get inserted meanwhile */
449 ++psidid->refcount;
450 spin_unlock(cidlock);
451 kfree(npsidid->sidstr);
452 kfree(npsidid);
453 } else {
454 psidid = npsidid;
455 id_rb_insert(cidtree, psid, &psidid,
456 sidtype == SIDOWNER ? "os:" : "gs:");
457 ++psidid->refcount;
458 spin_unlock(cidlock);
459 }
460 } else {
461 ++psidid->refcount;
462 spin_unlock(cidlock);
463 }
464 271
465 /* 272 /*
466 * If we are here, it is safe to access psidid and its fields 273 * If we have too many subauthorities, then something is really wrong.
467 * since a reference was taken earlier while holding the spinlock. 274 * Just return an error.
468 * A reference on the node is put without holding the spinlock
469 * and it is OK to do so in this case, shrinker will not erase
470 * this node until all references are put and we do not access
471 * any fields of the node after a reference is put .
472 */ 275 */
473 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 276 if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
474 cid = psidid->id; 277 cFYI(1, "%s: %u subauthorities is too many!", __func__,
475 psidid->time = jiffies; /* update ts for accessing */ 278 psid->num_subauth);
476 goto sid_to_id_out; 279 return -EIO;
477 } 280 }
478 281
479 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) 282 sidstr = sid_to_key_str(psid, sidtype);
480 goto sid_to_id_out; 283 if (!sidstr)
481 284 return -ENOMEM;
482 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 285
483 saved_cred = override_creds(root_cred); 286 saved_cred = override_creds(root_cred);
484 idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 287 sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
485 if (IS_ERR(idkey)) 288 if (IS_ERR(sidkey)) {
486 cFYI(1, "%s: Can't map SID to an id", __func__); 289 rc = -EINVAL;
487 else { 290 cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
488 cid = *(unsigned long *)idkey->payload.value; 291 sidtype == SIDOWNER ? 'u' : 'g');
489 psidid->id = cid; 292 goto out_revert_creds;
490 set_bit(SID_ID_MAPPED, &psidid->state); 293 }
491 key_put(idkey); 294
492 kfree(psidid->sidstr); 295 /*
493 } 296 * FIXME: Here we assume that uid_t and gid_t are same size. It's
494 revert_creds(saved_cred); 297 * probably a safe assumption but might be better to check based on
495 psidid->time = jiffies; /* update ts for accessing */ 298 * sidtype.
496 clear_bit(SID_ID_PENDING, &psidid->state); 299 */
497 wake_up_bit(&psidid->state, SID_ID_PENDING); 300 if (sidkey->datalen != sizeof(uid_t)) {
498 } else { 301 rc = -EIO;
499 rc = wait_on_bit(&psidid->state, SID_ID_PENDING, 302 cFYI(1, "%s: Downcall contained malformed key "
500 sidid_pending_wait, TASK_INTERRUPTIBLE); 303 "(datalen=%hu)", __func__, sidkey->datalen);
501 if (rc) { 304 key_invalidate(sidkey);
502 cFYI(1, "%s: sidid_pending_wait interrupted %d", 305 goto out_key_put;
503 __func__, rc);
504 --psidid->refcount; /* decremented without spinlock */
505 return rc;
506 }
507 if (test_bit(SID_ID_MAPPED, &psidid->state))
508 cid = psidid->id;
509 } 306 }
510 307
511sid_to_id_out:
512 --psidid->refcount; /* decremented without spinlock */
513 if (sidtype == SIDOWNER) 308 if (sidtype == SIDOWNER)
514 fattr->cf_uid = cid; 309 memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
515 else 310 else
516 fattr->cf_gid = cid; 311 memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
312
313out_key_put:
314 key_put(sidkey);
315out_revert_creds:
316 revert_creds(saved_cred);
317 kfree(sidstr);
517 318
319 /*
320 * Note that we return 0 here unconditionally. If the mapping
321 * fails then we just fall back to using the mnt_uid/mnt_gid.
322 */
323 if (sidtype == SIDOWNER)
324 fattr->cf_uid = fuid;
325 else
326 fattr->cf_gid = fgid;
518 return 0; 327 return 0;
519} 328}
520 329
@@ -537,19 +346,15 @@ init_cifs_idmap(void)
537 if (!cred) 346 if (!cred)
538 return -ENOMEM; 347 return -ENOMEM;
539 348
540 keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, 349 keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
541 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 350 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
542 KEY_USR_VIEW | KEY_USR_READ, 351 KEY_USR_VIEW | KEY_USR_READ,
543 KEY_ALLOC_NOT_IN_QUOTA); 352 KEY_ALLOC_NOT_IN_QUOTA, NULL);
544 if (IS_ERR(keyring)) { 353 if (IS_ERR(keyring)) {
545 ret = PTR_ERR(keyring); 354 ret = PTR_ERR(keyring);
546 goto failed_put_cred; 355 goto failed_put_cred;
547 } 356 }
548 357
549 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
550 if (ret < 0)
551 goto failed_put_key;
552
553 ret = register_key_type(&cifs_idmap_key_type); 358 ret = register_key_type(&cifs_idmap_key_type);
554 if (ret < 0) 359 if (ret < 0)
555 goto failed_put_key; 360 goto failed_put_key;
@@ -561,17 +366,6 @@ init_cifs_idmap(void)
561 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 366 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
562 root_cred = cred; 367 root_cred = cred;
563 368
564 spin_lock_init(&siduidlock);
565 uidtree = RB_ROOT;
566 spin_lock_init(&sidgidlock);
567 gidtree = RB_ROOT;
568
569 spin_lock_init(&uidsidlock);
570 siduidtree = RB_ROOT;
571 spin_lock_init(&gidsidlock);
572 sidgidtree = RB_ROOT;
573 register_shrinker(&cifs_shrinker);
574
575 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); 369 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
576 return 0; 370 return 0;
577 371
@@ -588,95 +382,13 @@ exit_cifs_idmap(void)
588 key_revoke(root_cred->thread_keyring); 382 key_revoke(root_cred->thread_keyring);
589 unregister_key_type(&cifs_idmap_key_type); 383 unregister_key_type(&cifs_idmap_key_type);
590 put_cred(root_cred); 384 put_cred(root_cred);
591 unregister_shrinker(&cifs_shrinker);
592 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); 385 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
593} 386}
594 387
595void
596cifs_destroy_idmaptrees(void)
597{
598 struct rb_root *root;
599 struct rb_node *node;
600
601 root = &uidtree;
602 spin_lock(&siduidlock);
603 while ((node = rb_first(root)))
604 rb_erase(node, root);
605 spin_unlock(&siduidlock);
606
607 root = &gidtree;
608 spin_lock(&sidgidlock);
609 while ((node = rb_first(root)))
610 rb_erase(node, root);
611 spin_unlock(&sidgidlock);
612
613 root = &siduidtree;
614 spin_lock(&uidsidlock);
615 while ((node = rb_first(root)))
616 rb_erase(node, root);
617 spin_unlock(&uidsidlock);
618
619 root = &sidgidtree;
620 spin_lock(&gidsidlock);
621 while ((node = rb_first(root)))
622 rb_erase(node, root);
623 spin_unlock(&gidsidlock);
624}
625
626/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
627 the same returns 1, if they do not match returns 0 */
628int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
629{
630 int i;
631 int num_subauth, num_sat, num_saw;
632
633 if ((!ctsid) || (!cwsid))
634 return 1;
635
636 /* compare the revision */
637 if (ctsid->revision != cwsid->revision) {
638 if (ctsid->revision > cwsid->revision)
639 return 1;
640 else
641 return -1;
642 }
643
644 /* compare all of the six auth values */
645 for (i = 0; i < 6; ++i) {
646 if (ctsid->authority[i] != cwsid->authority[i]) {
647 if (ctsid->authority[i] > cwsid->authority[i])
648 return 1;
649 else
650 return -1;
651 }
652 }
653
654 /* compare all of the subauth values if any */
655 num_sat = ctsid->num_subauth;
656 num_saw = cwsid->num_subauth;
657 num_subauth = num_sat < num_saw ? num_sat : num_saw;
658 if (num_subauth) {
659 for (i = 0; i < num_subauth; ++i) {
660 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
661 if (le32_to_cpu(ctsid->sub_auth[i]) >
662 le32_to_cpu(cwsid->sub_auth[i]))
663 return 1;
664 else
665 return -1;
666 }
667 }
668 }
669
670 return 0; /* sids compare/match */
671}
672
673
674/* copy ntsd, owner sid, and group sid from a security descriptor to another */ 388/* copy ntsd, owner sid, and group sid from a security descriptor to another */
675static void copy_sec_desc(const struct cifs_ntsd *pntsd, 389static void copy_sec_desc(const struct cifs_ntsd *pntsd,
676 struct cifs_ntsd *pnntsd, __u32 sidsoffset) 390 struct cifs_ntsd *pnntsd, __u32 sidsoffset)
677{ 391{
678 int i;
679
680 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 392 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
681 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; 393 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
682 394
@@ -692,26 +404,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
692 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + 404 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
693 le32_to_cpu(pntsd->osidoffset)); 405 le32_to_cpu(pntsd->osidoffset));
694 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); 406 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
695 407 cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
696 nowner_sid_ptr->revision = owner_sid_ptr->revision;
697 nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
698 for (i = 0; i < 6; i++)
699 nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
700 for (i = 0; i < 5; i++)
701 nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
702 408
703 /* copy group sid */ 409 /* copy group sid */
704 group_sid_ptr = (struct cifs_sid *)((char *)pntsd + 410 group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
705 le32_to_cpu(pntsd->gsidoffset)); 411 le32_to_cpu(pntsd->gsidoffset));
706 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + 412 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
707 sizeof(struct cifs_sid)); 413 sizeof(struct cifs_sid));
708 414 cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
709 ngroup_sid_ptr->revision = group_sid_ptr->revision;
710 ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
711 for (i = 0; i < 6; i++)
712 ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
713 for (i = 0; i < 5; i++)
714 ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
715 415
716 return; 416 return;
717} 417}
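Editor's note: the two cifs_copy_sid() calls above collapse the old field-by-field loops. A minimal sketch of such a helper, assuming it simply copies the fixed-size structure; the real helper lives elsewhere in cifsacl.c:

static void
cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
{
	memcpy(dst, src, sizeof(*dst));	/* header plus full sub_auth array */
}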
@@ -818,7 +518,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
818 518
819 pntace->sid.revision = psid->revision; 519 pntace->sid.revision = psid->revision;
820 pntace->sid.num_subauth = psid->num_subauth; 520 pntace->sid.num_subauth = psid->num_subauth;
821 for (i = 0; i < 6; i++) 521 for (i = 0; i < NUM_AUTHS; i++)
822 pntace->sid.authority[i] = psid->authority[i]; 522 pntace->sid.authority[i] = psid->authority[i];
823 for (i = 0; i < psid->num_subauth; i++) 523 for (i = 0; i < psid->num_subauth; i++)
824 pntace->sid.sub_auth[i] = psid->sub_auth[i]; 524 pntace->sid.sub_auth[i] = psid->sub_auth[i];
@@ -994,8 +694,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
994 return -EINVAL; 694 return -EINVAL;
995 } 695 }
996 696
997 if (psid->num_subauth) {
998#ifdef CONFIG_CIFS_DEBUG2 697#ifdef CONFIG_CIFS_DEBUG2
698 if (psid->num_subauth) {
999 int i; 699 int i;
1000 cFYI(1, "SID revision %d num_auth %d", 700 cFYI(1, "SID revision %d num_auth %d",
1001 psid->revision, psid->num_subauth); 701 psid->revision, psid->num_subauth);
@@ -1009,8 +709,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
1009 num auths and therefore go off the end */ 709 num auths and therefore go off the end */
1010 cFYI(1, "RID 0x%x", 710 cFYI(1, "RID 0x%x",
1011 le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); 711 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
1012#endif
1013 } 712 }
713#endif
1014 714
1015 return 0; 715 return 0;
1016} 716}
@@ -1120,8 +820,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
1120 kfree(nowner_sid_ptr); 820 kfree(nowner_sid_ptr);
1121 return rc; 821 return rc;
1122 } 822 }
1123 memcpy(owner_sid_ptr, nowner_sid_ptr, 823 cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
1124 sizeof(struct cifs_sid));
1125 kfree(nowner_sid_ptr); 824 kfree(nowner_sid_ptr);
1126 *aclflag = CIFS_ACL_OWNER; 825 *aclflag = CIFS_ACL_OWNER;
1127 } 826 }
@@ -1139,8 +838,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
1139 kfree(ngroup_sid_ptr); 838 kfree(ngroup_sid_ptr);
1140 return rc; 839 return rc;
1141 } 840 }
1142 memcpy(group_sid_ptr, ngroup_sid_ptr, 841 cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
1143 sizeof(struct cifs_sid));
1144 kfree(ngroup_sid_ptr); 842 kfree(ngroup_sid_ptr);
1145 *aclflag = CIFS_ACL_GROUP; 843 *aclflag = CIFS_ACL_GROUP;
1146 } 844 }
@@ -1316,42 +1014,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
1316 1014
1317 /* Get the security descriptor */ 1015 /* Get the security descriptor */
1318 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 1016 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
1319
1320 /* Add three ACEs for owner, group, everyone getting rid of
1321 other ACEs as chmod disables ACEs and set the security descriptor */
1322
1323 if (IS_ERR(pntsd)) { 1017 if (IS_ERR(pntsd)) {
1324 rc = PTR_ERR(pntsd); 1018 rc = PTR_ERR(pntsd);
1325 cERROR(1, "%s: error %d getting sec desc", __func__, rc); 1019 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
1326 } else { 1020 goto out;
1327 /* allocate memory for the smb header, 1021 }
1328 set security descriptor request security descriptor
1329 parameters, and secuirty descriptor itself */
1330
1331 secdesclen = secdesclen < DEFSECDESCLEN ?
1332 DEFSECDESCLEN : secdesclen;
1333 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1334 if (!pnntsd) {
1335 cERROR(1, "Unable to allocate security descriptor");
1336 kfree(pntsd);
1337 return -ENOMEM;
1338 }
1339 1022
1340 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, 1023 /*
1341 &aclflag); 1024 * Add three ACEs for owner, group, everyone getting rid of other ACEs
1025 * as chmod disables ACEs and set the security descriptor. Allocate
1026 * memory for the smb header, set security descriptor request security
1027 * descriptor parameters, and secuirty descriptor itself
1028 * descriptor parameters, and security descriptor itself
1029 secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
1030 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1031 if (!pnntsd) {
1032 cERROR(1, "Unable to allocate security descriptor");
1033 kfree(pntsd);
1034 return -ENOMEM;
1035 }
1342 1036
1343 cFYI(DBG2, "build_sec_desc rc: %d", rc); 1037 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
1038 &aclflag);
1344 1039
1345 if (!rc) { 1040 cFYI(DBG2, "build_sec_desc rc: %d", rc);
1346 /* Set the security descriptor */
1347 rc = set_cifs_acl(pnntsd, secdesclen, inode,
1348 path, aclflag);
1349 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1350 }
1351 1041
1352 kfree(pnntsd); 1042 if (!rc) {
1353 kfree(pntsd); 1043 /* Set the security descriptor */
1044 rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
1045 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1354 } 1046 }
1355 1047
1048 kfree(pnntsd);
1049 kfree(pntsd);
1050out:
1356 return rc; 1051 return rc;
1357} 1052}
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 5c902c7ce524..4f3884835267 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -23,11 +23,8 @@
23#define _CIFSACL_H 23#define _CIFSACL_H
24 24
25 25
26#define NUM_AUTHS 6 /* number of authority fields */ 26#define NUM_AUTHS (6) /* number of authority fields */
27#define NUM_SUBAUTHS 5 /* number of sub authority fields */ 27#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
28#define NUM_WK_SIDS 7 /* number of well known sids */
29#define SIDNAMELENGTH 20 /* long enough for the ones we care about */
30#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */
31 28
32#define READ_BIT 0x4 29#define READ_BIT 0x4
33#define WRITE_BIT 0x2 30#define WRITE_BIT 0x2
@@ -41,12 +38,32 @@
41 38
42#define SIDOWNER 1 39#define SIDOWNER 1
43#define SIDGROUP 2 40#define SIDGROUP 2
44#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
45 41
46#define SID_ID_MAPPED 0 42/*
47#define SID_ID_PENDING 1 43 * Security Descriptor length containing DACL with 3 ACEs (one each for
48#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ 44 * owner, group and world).
49#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ 45 */
46#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
47 sizeof(struct cifs_acl) + \
48 (sizeof(struct cifs_ace) * 3))
49
50/*
51 * Maximum size of a string representation of a SID:
52 *
53 * The fields are unsigned values in decimal. So:
54 *
55 * u8: max 3 bytes in decimal
56 * u32: max 10 bytes in decimal
57 *
58 * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
59 *
60 * For authority field, max is when all 6 values are non-zero and it must be
61 * represented in hex. So "-0x" + 12 hex digits.
62 *
63 * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
64 */
65#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
66#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
50 67
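Editor's note: the two string-size constants above combine into a worst-case buffer bound for a SID's string form. A hedged sizing sketch:

	/* largest possible "S-..." representation of a SID */
	char sidstr[SID_STRING_BASE_SIZE +
		    SID_MAX_SUB_AUTHORITIES * SID_STRING_SUBAUTH_SIZE];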
51struct cifs_ntsd { 68struct cifs_ntsd {
52 __le16 revision; /* revision level */ 69 __le16 revision; /* revision level */
@@ -60,10 +77,13 @@ struct cifs_ntsd {
60struct cifs_sid { 77struct cifs_sid {
61 __u8 revision; /* revision level */ 78 __u8 revision; /* revision level */
62 __u8 num_subauth; 79 __u8 num_subauth;
63 __u8 authority[6]; 80 __u8 authority[NUM_AUTHS];
64 __le32 sub_auth[5]; /* sub_auth[num_subauth] */ 81 __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
65} __attribute__((packed)); 82} __attribute__((packed));
66 83
84/* size of a struct cifs_sid, sans sub_auth array */
85#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
86
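Editor's note: CIFS_SID_BASE_SIZE gives the on-the-wire size of a SID before its variable-length sub_auth array. A hedged helper sketch (hypothetical name, not part of the patch):

static inline size_t cifs_sid_size(const struct cifs_sid *sid)
{
	/* fixed header plus one __le32 per used sub-authority */
	return CIFS_SID_BASE_SIZE + sid->num_subauth * sizeof(__le32);
}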
67struct cifs_acl { 87struct cifs_acl {
68 __le16 revision; /* revision level */ 88 __le16 revision; /* revision level */
69 __le16 size; 89 __le16 size;
@@ -78,26 +98,4 @@ struct cifs_ace {
78 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ 98 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
79} __attribute__((packed)); 99} __attribute__((packed));
80 100
81struct cifs_wksid {
82 struct cifs_sid cifssid;
83 char sidname[SIDNAMELENGTH];
84} __attribute__((packed));
85
86struct cifs_sid_id {
87 unsigned int refcount; /* increment with spinlock, decrement without */
88 unsigned long id;
89 unsigned long time;
90 unsigned long state;
91 char *sidstr;
92 struct rb_node rbnode;
93 struct cifs_sid sid;
94};
95
96#ifdef __KERNEL__
97extern struct key_type cifs_idmap_key_type;
98extern const struct cred *root_cred;
99#endif /* KERNEL */
100
101extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
102
103#endif /* _CIFSACL_H */ 101#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e7931cc55d0c..ce9f3c5421bf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF;
64unsigned int sign_CIFS_PDUs = 1; 64unsigned int sign_CIFS_PDUs = 1;
65static const struct super_operations cifs_super_ops; 65static const struct super_operations cifs_super_ops;
66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
67module_param(CIFSMaxBufSize, int, 0); 67module_param(CIFSMaxBufSize, uint, 0);
68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " 68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
69 "Default: 16384 Range: 8192 to 130048"); 69 "Default: 16384 Range: 8192 to 130048");
70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; 70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
71module_param(cifs_min_rcv, int, 0); 71module_param(cifs_min_rcv, uint, 0);
72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " 72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
73 "1 to 64"); 73 "1 to 64");
74unsigned int cifs_min_small = 30; 74unsigned int cifs_min_small = 30;
75module_param(cifs_min_small, int, 0); 75module_param(cifs_min_small, uint, 0);
76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
77 "Range: 2 to 256"); 77 "Range: 2 to 256");
78unsigned int cifs_max_pending = CIFS_MAX_REQ; 78unsigned int cifs_max_pending = CIFS_MAX_REQ;
79module_param(cifs_max_pending, int, 0444); 79module_param(cifs_max_pending, uint, 0444);
80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
81 "Default: 32767 Range: 2 to 32767."); 81 "Default: 32767 Range: 2 to 32767.");
82module_param(enable_oplocks, bool, 0644); 82module_param(enable_oplocks, bool, 0644);
83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" 83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
84 "y/Y/1");
85 84
86extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
87extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
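Editor's note: switching these module parameters from int to uint makes negative values fail at module load rather than being silently accepted; usage is unchanged, e.g. "modprobe cifs cifs_max_pending=256" (a hypothetical value within the documented 2 to 32767 range).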
@@ -230,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb)
230 cifs_set_oplock_level(cifs_inode, 0); 229 cifs_set_oplock_level(cifs_inode, 0);
231 cifs_inode->delete_pending = false; 230 cifs_inode->delete_pending = false;
232 cifs_inode->invalid_mapping = false; 231 cifs_inode->invalid_mapping = false;
232 cifs_inode->leave_pages_clean = false;
233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
234 cifs_inode->server_eof = 0; 234 cifs_inode->server_eof = 0;
235 cifs_inode->uniqueid = 0; 235 cifs_inode->uniqueid = 0;
@@ -540,8 +540,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
540 char *s, *p; 540 char *s, *p;
541 char sep; 541 char sep;
542 542
543 full_path = build_path_to_root(vol, cifs_sb, 543 full_path = cifs_build_path_to_root(vol, cifs_sb,
544 cifs_sb_master_tcon(cifs_sb)); 544 cifs_sb_master_tcon(cifs_sb));
545 if (full_path == NULL) 545 if (full_path == NULL)
546 return ERR_PTR(-ENOMEM); 546 return ERR_PTR(-ENOMEM);
547 547
@@ -695,13 +695,13 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
695 return written; 695 return written;
696} 696}
697 697
698static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) 698static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
699{ 699{
700 /* 700 /*
701 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 701 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
702 * the cached file length 702 * the cached file length
703 */ 703 */
704 if (origin != SEEK_SET && origin != SEEK_CUR) { 704 if (whence != SEEK_SET && whence != SEEK_CUR) {
705 int rc; 705 int rc;
706 struct inode *inode = file->f_path.dentry->d_inode; 706 struct inode *inode = file->f_path.dentry->d_inode;
707 707
@@ -728,7 +728,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
728 if (rc < 0) 728 if (rc < 0)
729 return (loff_t)rc; 729 return (loff_t)rc;
730 } 730 }
731 return generic_file_llseek(file, offset, origin); 731 return generic_file_llseek(file, offset, whence);
732} 732}
733 733
734static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 734static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -1205,7 +1205,6 @@ exit_cifs(void)
1205 unregister_filesystem(&cifs_fs_type); 1205 unregister_filesystem(&cifs_fs_type);
1206 cifs_dfs_release_automount_timer(); 1206 cifs_dfs_release_automount_timer();
1207#ifdef CONFIG_CIFS_ACL 1207#ifdef CONFIG_CIFS_ACL
1208 cifs_destroy_idmaptrees();
1209 exit_cifs_idmap(); 1208 exit_cifs_idmap();
1210#endif 1209#endif
1211#ifdef CONFIG_CIFS_UPCALL 1210#ifdef CONFIG_CIFS_UPCALL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f5af2527fc69..aea1eec64911 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -178,6 +178,7 @@ struct smb_rqst {
178 178
179enum smb_version { 179enum smb_version {
180 Smb_1 = 1, 180 Smb_1 = 1,
181 Smb_20,
181 Smb_21, 182 Smb_21,
182 Smb_30, 183 Smb_30,
183}; 184};
@@ -280,9 +281,6 @@ struct smb_version_operations {
280 /* set attributes */ 281 /* set attributes */
281 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, 282 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
282 const unsigned int); 283 const unsigned int);
283 /* build a full path to the root of the mount */
284 char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *,
285 struct cifs_tcon *);
286 /* check if we can send an echo or not */ 284 /* check if we can send an echo or not */
287 bool (*can_echo)(struct TCP_Server_Info *); 285 bool (*can_echo)(struct TCP_Server_Info *);
288 /* send echo request */ 286 /* send echo request */
@@ -369,6 +367,8 @@ struct smb_version_operations {
369 void (*set_lease_key)(struct inode *, struct cifs_fid *fid); 367 void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
370 /* generate new lease key */ 368 /* generate new lease key */
371 void (*new_lease_key)(struct cifs_fid *fid); 369 void (*new_lease_key)(struct cifs_fid *fid);
370 int (*calc_signature)(struct smb_rqst *rqst,
371 struct TCP_Server_Info *server);
372}; 372};
373 373
374struct smb_version_values { 374struct smb_version_values {
@@ -396,7 +396,6 @@ struct smb_vol {
396 char *password; 396 char *password;
397 char *domainname; 397 char *domainname;
398 char *UNC; 398 char *UNC;
399 char *UNCip;
400 char *iocharset; /* local code page for mapping to and from Unicode */ 399 char *iocharset; /* local code page for mapping to and from Unicode */
401 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ 400 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
402 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ 401 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -444,11 +443,11 @@ struct smb_vol {
444 unsigned int rsize; 443 unsigned int rsize;
445 unsigned int wsize; 444 unsigned int wsize;
446 bool sockopt_tcp_nodelay:1; 445 bool sockopt_tcp_nodelay:1;
447 unsigned short int port;
448 unsigned long actimeo; /* attribute cache timeout (jiffies) */ 446 unsigned long actimeo; /* attribute cache timeout (jiffies) */
449 struct smb_version_operations *ops; 447 struct smb_version_operations *ops;
450 struct smb_version_values *vals; 448 struct smb_version_values *vals;
451 char *prepath; 449 char *prepath;
450 struct sockaddr_storage dstaddr; /* destination address */
452 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 451 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
453 struct nls_table *local_nls; 452 struct nls_table *local_nls;
454}; 453};
@@ -1031,6 +1030,7 @@ struct cifsInodeInfo {
1031 bool clientCanCacheAll; /* read and writebehind oplock */ 1030 bool clientCanCacheAll; /* read and writebehind oplock */
1032 bool delete_pending; /* DELETE_ON_CLOSE is set */ 1031 bool delete_pending; /* DELETE_ON_CLOSE is set */
1033 bool invalid_mapping; /* pagecache is invalid */ 1032 bool invalid_mapping; /* pagecache is invalid */
1033 bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */
1034 unsigned long time; /* jiffies of last update of inode */ 1034 unsigned long time; /* jiffies of last update of inode */
1035 u64 server_eof; /* current file size on server -- protected by i_lock */ 1035 u64 server_eof; /* current file size on server -- protected by i_lock */
1036 u64 uniqueid; /* server inode number */ 1036 u64 uniqueid; /* server inode number */
@@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
1067static inline void 1067static inline void
1068convert_delimiter(char *path, char delim) 1068convert_delimiter(char *path, char delim)
1069{ 1069{
1070 int i; 1070 char old_delim, *pos;
1071 char old_delim;
1072
1073 if (path == NULL)
1074 return;
1075 1071
1076 if (delim == '/') 1072 if (delim == '/')
1077 old_delim = '\\'; 1073 old_delim = '\\';
1078 else 1074 else
1079 old_delim = '/'; 1075 old_delim = '/';
1080 1076
1081 for (i = 0; path[i] != '\0'; i++) { 1077 pos = path;
1082 if (path[i] == old_delim) 1078 while ((pos = strchr(pos, old_delim)))
1083 path[i] = delim; 1079 *pos = delim;
1084 }
1085}
1086
1087static inline char *
1088build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
1089 struct cifs_tcon *tcon)
1090{
1091 if (!vol->ops->build_path_to_root)
1092 return NULL;
1093 return vol->ops->build_path_to_root(vol, cifs_sb, tcon);
1094} 1080}
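Editor's note: the strchr() rewrite above also drops the NULL check, so convert_delimiter() now assumes a non-NULL path. A hedged usage sketch:

	/* flip UNC separators in place */
	char path[] = "//srv/share/dir";
	convert_delimiter(path, '\\');	/* path becomes "\\srv\share\dir" */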
1095 1081
1096#ifdef CONFIG_CIFS_STATS 1082#ifdef CONFIG_CIFS_STATS
@@ -1362,7 +1348,7 @@ require use of the stronger protocol */
1362#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 1348#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
1363#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ 1349#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
1364 1350
1365#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) 1351#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
1366#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 1352#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
1367#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) 1353#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
1368/* 1354/*
@@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values;
1506extern struct smb_version_operations smb21_operations; 1492extern struct smb_version_operations smb21_operations;
1507extern struct smb_version_values smb21_values; 1493extern struct smb_version_values smb21_values;
1508#define SMB30_VERSION_STRING "3.0" 1494#define SMB30_VERSION_STRING "3.0"
1509/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ 1495extern struct smb_version_operations smb30_operations;
1510extern struct smb_version_values smb30_values; 1496extern struct smb_version_values smb30_values;
1511#endif /* _CIFS_GLOB_H */ 1497#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5144e9fbeb8c..1988c1baa224 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -58,8 +58,10 @@ do { \
58} while (0) 58} while (0)
59extern int init_cifs_idmap(void); 59extern int init_cifs_idmap(void);
60extern void exit_cifs_idmap(void); 60extern void exit_cifs_idmap(void);
61extern void cifs_destroy_idmaptrees(void);
62extern char *build_path_from_dentry(struct dentry *); 61extern char *build_path_from_dentry(struct dentry *);
62extern char *cifs_build_path_to_root(struct smb_vol *vol,
63 struct cifs_sb_info *cifs_sb,
64 struct cifs_tcon *tcon);
63extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 65extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
64extern char *cifs_compose_mount_options(const char *sb_mountdata, 66extern char *cifs_compose_mount_options(const char *sb_mountdata,
65 const char *fullpath, const struct dfs_info3_param *ref, 67 const char *fullpath, const struct dfs_info3_param *ref,
@@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf);
107extern int decode_negTokenInit(unsigned char *security_blob, int length, 109extern int decode_negTokenInit(unsigned char *security_blob, int length,
108 struct TCP_Server_Info *server); 110 struct TCP_Server_Info *server);
109extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 111extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
110extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); 112extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
111extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
112 const unsigned short int port);
113extern int map_smb_to_linux_error(char *buf, bool logErr); 113extern int map_smb_to_linux_error(char *buf, bool logErr);
114extern void header_assemble(struct smb_hdr *, char /* command */ , 114extern void header_assemble(struct smb_hdr *, char /* command */ ,
115 const struct cifs_tcon *, int /* length of 115 const struct cifs_tcon *, int /* length of
@@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, 185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
186 __u64 length, __u8 type, 186 __u64 length, __u8 type,
187 struct cifsLockInfo **conf_lock, 187 struct cifsLockInfo **conf_lock,
188 bool rw_check); 188 int rw_check);
189extern void cifs_add_pending_open(struct cifs_fid *fid, 189extern void cifs_add_pending_open(struct cifs_fid *fid,
190 struct tcon_link *tlink, 190 struct tcon_link *tlink,
191 struct cifs_pending_open *open); 191 struct cifs_pending_open *open);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5c670b998ffb..7635b5db26a7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = {
186 { Opt_user, "user=%s" }, 186 { Opt_user, "user=%s" },
187 { Opt_user, "username=%s" }, 187 { Opt_user, "username=%s" },
188 { Opt_blank_pass, "pass=" }, 188 { Opt_blank_pass, "pass=" },
189 { Opt_blank_pass, "password=" },
189 { Opt_pass, "pass=%s" }, 190 { Opt_pass, "pass=%s" },
190 { Opt_pass, "password=%s" }, 191 { Opt_pass, "password=%s" },
191 { Opt_blank_ip, "ip=" }, 192 { Opt_blank_ip, "ip=" },
@@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = {
274 275
275static const match_table_t cifs_smb_version_tokens = { 276static const match_table_t cifs_smb_version_tokens = {
276 { Smb_1, SMB1_VERSION_STRING }, 277 { Smb_1, SMB1_VERSION_STRING },
278 { Smb_20, SMB20_VERSION_STRING},
277 { Smb_21, SMB21_VERSION_STRING }, 279 { Smb_21, SMB21_VERSION_STRING },
278 { Smb_30, SMB30_VERSION_STRING }, 280 { Smb_30, SMB30_VERSION_STRING },
279}; 281};
@@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1074 vol->vals = &smb1_values; 1076 vol->vals = &smb1_values;
1075 break; 1077 break;
1076#ifdef CONFIG_CIFS_SMB2 1078#ifdef CONFIG_CIFS_SMB2
1079 case Smb_20:
1080 vol->ops = &smb21_operations; /* currently identical with 2.1 */
1081 vol->vals = &smb20_values;
1082 break;
1077 case Smb_21: 1083 case Smb_21:
1078 vol->ops = &smb21_operations; 1084 vol->ops = &smb21_operations;
1079 vol->vals = &smb21_values; 1085 vol->vals = &smb21_values;
1080 break; 1086 break;
1081 case Smb_30: 1087 case Smb_30:
1082 vol->ops = &smb21_operations; /* currently identical with 2.1 */ 1088 vol->ops = &smb30_operations;
1083 vol->vals = &smb30_values; 1089 vol->vals = &smb30_values;
1084 break; 1090 break;
1085#endif 1091#endif
@@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1090 return 0; 1096 return 0;
1091} 1097}
1092 1098
1099/*
1100 * Parse a devname into substrings and populate the vol->UNC and vol->prepath
1101 * fields with the result. Returns 0 on success and an error otherwise.
1102 */
1103static int
1104cifs_parse_devname(const char *devname, struct smb_vol *vol)
1105{
1106 char *pos;
1107 const char *delims = "/\\";
1108 size_t len;
1109
1110 /* make sure we have a valid UNC double delimiter prefix */
1111 len = strspn(devname, delims);
1112 if (len != 2)
1113 return -EINVAL;
1114
1115 /* find delimiter between host and sharename */
1116 pos = strpbrk(devname + 2, delims);
1117 if (!pos)
1118 return -EINVAL;
1119
1120 /* skip past delimiter */
1121 ++pos;
1122
1123 /* now go until next delimiter or end of string */
1124 len = strcspn(pos, delims);
1125
1126 /* move "pos" up to delimiter or NULL */
1127 pos += len;
1128 vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
1129 if (!vol->UNC)
1130 return -ENOMEM;
1131
1132 convert_delimiter(vol->UNC, '\\');
1133
1134 /* If pos is NULL, or is a bogus trailing delimiter then no prepath */
1135 if (!*pos++ || !*pos)
1136 return 0;
1137
1138 vol->prepath = kstrdup(pos, GFP_KERNEL);
1139 if (!vol->prepath)
1140 return -ENOMEM;
1141
1142 return 0;
1143}
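Editor's note: tracing cifs_parse_devname() above with a typical device string (hypothetical host and share names):

	/* worked example, not from the patch itself:
	 *   devname      = "//host/share/sub/dir"
	 *   vol->UNC     = "\\host\share"   (delimiters converted)
	 *   vol->prepath = "sub/dir"
	 * "//host/share" and "//host/share/" both leave prepath unset.
	 */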
1144
1093static int 1145static int
1094cifs_parse_mount_options(const char *mountdata, const char *devname, 1146cifs_parse_mount_options(const char *mountdata, const char *devname,
1095 struct smb_vol *vol) 1147 struct smb_vol *vol)
@@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1108 char *string = NULL; 1160 char *string = NULL;
1109 char *tmp_end, *value; 1161 char *tmp_end, *value;
1110 char delim; 1162 char delim;
1163 bool got_ip = false;
1164 unsigned short port = 0;
1165 struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
1111 1166
1112 separator[0] = ','; 1167 separator[0] = ',';
1113 separator[1] = 0; 1168 separator[1] = 0;
1114 delim = separator[0]; 1169 delim = separator[0];
1115 1170
1171 /* ensure we always start with zeroed-out smb_vol */
1172 memset(vol, 0, sizeof(*vol));
1173
1116 /* 1174 /*
1117 * does not have to be perfect mapping since field is 1175 * does not have to be perfect mapping since field is
1118 * informational, only used for servers that do not support 1176 * informational, only used for servers that do not support
@@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1169 vol->backupuid_specified = false; /* no backup intent for a user */ 1227 vol->backupuid_specified = false; /* no backup intent for a user */
1170 vol->backupgid_specified = false; /* no backup intent for a group */ 1228 vol->backupgid_specified = false; /* no backup intent for a group */
1171 1229
1230 /*
1231 * For now, we ignore -EINVAL errors under the assumption that the
1232 * unc= and prefixpath= options will be usable.
1233 */
1234 if (cifs_parse_devname(devname, vol) == -ENOMEM) {
1235 printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
1236 "device string.\n");
1237 goto out_nomem;
1238 }
1239
1172 while ((data = strsep(&options, separator)) != NULL) { 1240 while ((data = strsep(&options, separator)) != NULL) {
1173 substring_t args[MAX_OPT_ARGS]; 1241 substring_t args[MAX_OPT_ARGS];
1174 unsigned long option; 1242 unsigned long option;
@@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1416 vol->dir_mode = option; 1484 vol->dir_mode = option;
1417 break; 1485 break;
1418 case Opt_port: 1486 case Opt_port:
1419 if (get_option_ul(args, &option)) { 1487 if (get_option_ul(args, &option) ||
1420 cERROR(1, "%s: Invalid port value", 1488 option > USHRT_MAX) {
1421 __func__); 1489 cERROR(1, "%s: Invalid port value", __func__);
1422 goto cifs_parse_mount_err; 1490 goto cifs_parse_mount_err;
1423 } 1491 }
1424 vol->port = option; 1492 port = (unsigned short)option;
1425 break; 1493 break;
1426 case Opt_rsize: 1494 case Opt_rsize:
1427 if (get_option_ul(args, &option)) { 1495 if (get_option_ul(args, &option)) {
@@ -1537,53 +1605,48 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1537 vol->password[j] = '\0'; 1605 vol->password[j] = '\0';
1538 break; 1606 break;
1539 case Opt_blank_ip: 1607 case Opt_blank_ip:
1540 vol->UNCip = NULL; 1608 /* FIXME: should this be an error instead? */
1609 got_ip = false;
1541 break; 1610 break;
1542 case Opt_ip: 1611 case Opt_ip:
1543 string = match_strdup(args); 1612 string = match_strdup(args);
1544 if (string == NULL) 1613 if (string == NULL)
1545 goto out_nomem; 1614 goto out_nomem;
1546 1615
1547 if (strnlen(string, INET6_ADDRSTRLEN) > 1616 if (!cifs_convert_address(dstaddr, string,
1548 INET6_ADDRSTRLEN) { 1617 strlen(string))) {
1549 printk(KERN_WARNING "CIFS: ip address " 1618 printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
1550 "too long\n"); 1619 string);
1551 goto cifs_parse_mount_err;
1552 }
1553 vol->UNCip = kstrdup(string, GFP_KERNEL);
1554 if (!vol->UNCip) {
1555 printk(KERN_WARNING "CIFS: no memory "
1556 "for UNC IP\n");
1557 goto cifs_parse_mount_err; 1620 goto cifs_parse_mount_err;
1558 } 1621 }
1622 got_ip = true;
1559 break; 1623 break;
1560 case Opt_unc: 1624 case Opt_unc:
1561 string = match_strdup(args); 1625 string = vol->UNC;
1562 if (string == NULL) 1626 vol->UNC = match_strdup(args);
1627 if (vol->UNC == NULL) {
1628 kfree(string);
1563 goto out_nomem; 1629 goto out_nomem;
1564
1565 temp_len = strnlen(string, 300);
1566 if (temp_len == 300) {
1567 printk(KERN_WARNING "CIFS: UNC name too long\n");
1568 goto cifs_parse_mount_err;
1569 } 1630 }
1570 1631
1571 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); 1632 convert_delimiter(vol->UNC, '\\');
1572 if (vol->UNC == NULL) { 1633 if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
1573 printk(KERN_WARNING "CIFS: no memory for UNC\n"); 1634 kfree(string);
1574 goto cifs_parse_mount_err; 1635 printk(KERN_ERR "CIFS: UNC Path does not "
1575 } 1636 "begin with // or \\\\\n");
1576 strcpy(vol->UNC, string);
1577
1578 if (strncmp(string, "//", 2) == 0) {
1579 vol->UNC[0] = '\\';
1580 vol->UNC[1] = '\\';
1581 } else if (strncmp(string, "\\\\", 2) != 0) {
1582 printk(KERN_WARNING "CIFS: UNC Path does not "
1583 "begin with // or \\\\\n");
1584 goto cifs_parse_mount_err; 1637 goto cifs_parse_mount_err;
1585 } 1638 }
1586 1639
1640 /* Compare old unc= option to new one */
1641 if (!string || strcmp(string, vol->UNC))
1642 printk(KERN_WARNING "CIFS: the value of the "
1643 "unc= mount option does not match the "
1644 "device string. Using the unc= option "
1645 "for now. In 3.10, that option will "
1646 "be ignored and the contents of the "
1647 "device string will be used "
1648 "instead. (%s != %s)\n", string,
1649 vol->UNC);
1587 break; 1650 break;
1588 case Opt_domain: 1651 case Opt_domain:
1589 string = match_strdup(args); 1652 string = match_strdup(args);
@@ -1618,31 +1681,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1618 } 1681 }
1619 break; 1682 break;
1620 case Opt_prefixpath: 1683 case Opt_prefixpath:
1621 string = match_strdup(args); 1684 /* skip over any leading delimiter */
1622 if (string == NULL) 1685 if (*args[0].from == '/' || *args[0].from == '\\')
1623 goto out_nomem; 1686 args[0].from++;
1624
1625 temp_len = strnlen(string, 1024);
1626 if (string[0] != '/')
1627 temp_len++; /* missing leading slash */
1628 if (temp_len > 1024) {
1629 printk(KERN_WARNING "CIFS: prefix too long\n");
1630 goto cifs_parse_mount_err;
1631 }
1632 1687
1633 vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); 1688 string = vol->prepath;
1689 vol->prepath = match_strdup(args);
1634 if (vol->prepath == NULL) { 1690 if (vol->prepath == NULL) {
1635 printk(KERN_WARNING "CIFS: no memory " 1691 kfree(string);
1636 "for path prefix\n"); 1692 goto out_nomem;
1637 goto cifs_parse_mount_err;
1638 } 1693 }
1639 1694 /* Compare old prefixpath= option to new one */
1640 if (string[0] != '/') { 1695 if (!string || strcmp(string, vol->prepath))
1641 vol->prepath[0] = '/'; 1696 printk(KERN_WARNING "CIFS: the value of the "
1642 strcpy(vol->prepath+1, string); 1697 "prefixpath= mount option does not "
1643 } else 1698 "match the device string. Using the "
1644 strcpy(vol->prepath, string); 1699 "prefixpath= option for now. In 3.10, "
1645 1700 "that option will be ignored and the "
1701 "contents of the device string will be "
1702 "used instead.(%s != %s)\n", string,
1703 vol->prepath);
1646 break; 1704 break;
1647 case Opt_iocharset: 1705 case Opt_iocharset:
1648 string = match_strdup(args); 1706 string = match_strdup(args);
@@ -1799,9 +1857,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1799 goto cifs_parse_mount_err; 1857 goto cifs_parse_mount_err;
1800 } 1858 }
1801#endif 1859#endif
1860 if (!vol->UNC) {
1861 cERROR(1, "CIFS mount error: No usable UNC path provided in "
1862 "device string or in unc= option!");
1863 goto cifs_parse_mount_err;
1864 }
1802 1865
1803 if (vol->UNCip == NULL) 1866 /* make sure UNC has a share name */
1804 vol->UNCip = &vol->UNC[2]; 1867 if (!strchr(vol->UNC + 3, '\\')) {
1868 cERROR(1, "Malformed UNC. Unable to find share name.");
1869 goto cifs_parse_mount_err;
1870 }
1871
1872 if (!got_ip) {
1873 /* No ip= option specified? Try to get it from UNC */
1874 if (!cifs_convert_address(dstaddr, &vol->UNC[2],
1875 strlen(&vol->UNC[2]))) {
1876 printk(KERN_ERR "Unable to determine destination "
1877 "address.\n");
1878 goto cifs_parse_mount_err;
1879 }
1880 }
1881
1882 /* set the port that we got earlier */
1883 cifs_set_port(dstaddr, port);
1805 1884
1806 if (uid_specified) 1885 if (uid_specified)
1807 vol->override_uid = override_uid; 1886 vol->override_uid = override_uid;
@@ -1972,9 +2051,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1972 return true; 2051 return true;
1973} 2052}
1974 2053
1975static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, 2054static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
1976 struct smb_vol *vol)
1977{ 2055{
2056 struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
2057
1978 if ((server->vals != vol->vals) || (server->ops != vol->ops)) 2058 if ((server->vals != vol->vals) || (server->ops != vol->ops))
1979 return 0; 2059 return 0;
1980 2060
@@ -1995,13 +2075,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
1995} 2075}
1996 2076
1997static struct TCP_Server_Info * 2077static struct TCP_Server_Info *
1998cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) 2078cifs_find_tcp_session(struct smb_vol *vol)
1999{ 2079{
2000 struct TCP_Server_Info *server; 2080 struct TCP_Server_Info *server;
2001 2081
2002 spin_lock(&cifs_tcp_ses_lock); 2082 spin_lock(&cifs_tcp_ses_lock);
2003 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 2083 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
2004 if (!match_server(server, addr, vol)) 2084 if (!match_server(server, vol))
2005 continue; 2085 continue;
2006 2086
2007 ++server->srv_count; 2087 ++server->srv_count;
@@ -2051,40 +2131,12 @@ static struct TCP_Server_Info *
2051cifs_get_tcp_session(struct smb_vol *volume_info) 2131cifs_get_tcp_session(struct smb_vol *volume_info)
2052{ 2132{
2053 struct TCP_Server_Info *tcp_ses = NULL; 2133 struct TCP_Server_Info *tcp_ses = NULL;
2054 struct sockaddr_storage addr;
2055 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
2056 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
2057 int rc; 2134 int rc;
2058 2135
2059 memset(&addr, 0, sizeof(struct sockaddr_storage)); 2136 cFYI(1, "UNC: %s", volume_info->UNC);
2060
2061 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
2062
2063 if (volume_info->UNCip && volume_info->UNC) {
2064 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2065 volume_info->UNCip,
2066 strlen(volume_info->UNCip),
2067 volume_info->port);
2068 if (!rc) {
2069 /* we failed translating address */
2070 rc = -EINVAL;
2071 goto out_err;
2072 }
2073 } else if (volume_info->UNCip) {
2074 /* BB using ip addr as tcp_ses name to connect to the
2075 DFS root below */
2076 cERROR(1, "Connecting to DFS root not implemented yet");
2077 rc = -EINVAL;
2078 goto out_err;
2079 } else /* which tcp_sess DFS root would we conect to */ {
2080 cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
2081 "unc=//192.168.1.100/public) specified");
2082 rc = -EINVAL;
2083 goto out_err;
2084 }
2085 2137
2086 /* see if we already have a matching tcp_ses */ 2138 /* see if we already have a matching tcp_ses */
2087 tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); 2139 tcp_ses = cifs_find_tcp_session(volume_info);
2088 if (tcp_ses) 2140 if (tcp_ses)
2089 return tcp_ses; 2141 return tcp_ses;
2090 2142
@@ -2129,27 +2181,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
2129 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 2181 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
2130 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 2182 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
2131 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); 2183 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
2132 2184 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
2185 sizeof(tcp_ses->srcaddr));
2186 memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
2187 sizeof(tcp_ses->dstaddr));
2133 /* 2188 /*
2134 * at this point we are the only ones with the pointer 2189 * at this point we are the only ones with the pointer
2135 * to the struct since the kernel thread not created yet 2190 * to the struct since the kernel thread not created yet
2136 * no need to spinlock this init of tcpStatus or srv_count 2191 * no need to spinlock this init of tcpStatus or srv_count
2137 */ 2192 */
2138 tcp_ses->tcpStatus = CifsNew; 2193 tcp_ses->tcpStatus = CifsNew;
2139 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
2140 sizeof(tcp_ses->srcaddr));
2141 ++tcp_ses->srv_count; 2194 ++tcp_ses->srv_count;
2142 2195
2143 if (addr.ss_family == AF_INET6) {
2144 cFYI(1, "attempting ipv6 connect");
2145 /* BB should we allow ipv6 on port 139? */
2146 /* other OS never observed in Wild doing 139 with v6 */
2147 memcpy(&tcp_ses->dstaddr, sin_server6,
2148 sizeof(struct sockaddr_in6));
2149 } else
2150 memcpy(&tcp_ses->dstaddr, sin_server,
2151 sizeof(struct sockaddr_in));
2152
2153 rc = ip_connect(tcp_ses); 2196 rc = ip_connect(tcp_ses);
2154 if (rc < 0) { 2197 if (rc < 0) {
2155 cERROR(1, "Error connecting to socket. Aborting operation"); 2198 cERROR(1, "Error connecting to socket. Aborting operation");
@@ -2397,8 +2440,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
2397} 2440}
2398#endif /* CONFIG_KEYS */ 2441#endif /* CONFIG_KEYS */
2399 2442
2400static bool warned_on_ntlm; /* globals init to false automatically */
2401
2402static struct cifs_ses * 2443static struct cifs_ses *
2403cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 2444cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2404{ 2445{
@@ -2475,14 +2516,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2475 ses->cred_uid = volume_info->cred_uid; 2516 ses->cred_uid = volume_info->cred_uid;
2476 ses->linux_uid = volume_info->linux_uid; 2517 ses->linux_uid = volume_info->linux_uid;
2477 2518
2478 /* ntlmv2 is much stronger than ntlm security, and has been broadly
2479 supported for many years, time to update default security mechanism */
2480 if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
2481 warned_on_ntlm = true;
2482 cERROR(1, "default security mechanism requested. The default "
2483 "security mechanism will be upgraded from ntlm to "
2484 "ntlmv2 in kernel release 3.3");
2485 }
2486 ses->overrideSecFlg = volume_info->secFlg; 2519 ses->overrideSecFlg = volume_info->secFlg;
2487 2520
2488 mutex_lock(&ses->session_mutex); 2521 mutex_lock(&ses->session_mutex);
@@ -2598,13 +2631,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
2598 } 2631 }
2599 } 2632 }
2600 2633
2601 if (strchr(volume_info->UNC + 3, '\\') == NULL
2602 && strchr(volume_info->UNC + 3, '/') == NULL) {
2603 cERROR(1, "Missing share name");
2604 rc = -ENODEV;
2605 goto out_fail;
2606 }
2607
2608 /* 2634 /*
2609 * BB Do we need to wrap session_mutex around this TCon call and Unix 2635 * BB Do we need to wrap session_mutex around this TCon call and Unix
2610 * SetFS as we do on SessSetup and reconnect? 2636 * SetFS as we do on SessSetup and reconnect?
@@ -2718,11 +2744,8 @@ cifs_match_super(struct super_block *sb, void *data)
2718 struct cifs_ses *ses; 2744 struct cifs_ses *ses;
2719 struct cifs_tcon *tcon; 2745 struct cifs_tcon *tcon;
2720 struct tcon_link *tlink; 2746 struct tcon_link *tlink;
2721 struct sockaddr_storage addr;
2722 int rc = 0; 2747 int rc = 0;
2723 2748
2724 memset(&addr, 0, sizeof(struct sockaddr_storage));
2725
2726 spin_lock(&cifs_tcp_ses_lock); 2749 spin_lock(&cifs_tcp_ses_lock);
2727 cifs_sb = CIFS_SB(sb); 2750 cifs_sb = CIFS_SB(sb);
2728 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); 2751 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
@@ -2736,17 +2759,7 @@ cifs_match_super(struct super_block *sb, void *data)
2736 2759
2737 volume_info = mnt_data->vol; 2760 volume_info = mnt_data->vol;
2738 2761
2739 if (!volume_info->UNCip || !volume_info->UNC) 2762 if (!match_server(tcp_srv, volume_info) ||
2740 goto out;
2741
2742 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2743 volume_info->UNCip,
2744 strlen(volume_info->UNCip),
2745 volume_info->port);
2746 if (!rc)
2747 goto out;
2748
2749 if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) ||
2750 !match_session(ses, volume_info) || 2763 !match_session(ses, volume_info) ||
2751 !match_tcon(tcon, volume_info->UNC)) { 2764 !match_tcon(tcon, volume_info->UNC)) {
2752 rc = 0; 2765 rc = 0;
@@ -3261,8 +3274,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
3261{ 3274{
3262 kfree(volume_info->username); 3275 kfree(volume_info->username);
3263 kzfree(volume_info->password); 3276 kzfree(volume_info->password);
3264 if (volume_info->UNCip != volume_info->UNC + 2)
3265 kfree(volume_info->UNCip);
3266 kfree(volume_info->UNC); 3277 kfree(volume_info->UNC);
3267 kfree(volume_info->domainname); 3278 kfree(volume_info->domainname);
3268 kfree(volume_info->iocharset); 3279 kfree(volume_info->iocharset);
@@ -3280,14 +3291,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
3280 3291
3281 3292
3282#ifdef CONFIG_CIFS_DFS_UPCALL 3293#ifdef CONFIG_CIFS_DFS_UPCALL
3283/* build_path_to_root returns full path to root when 3294/*
3284 * we do not have an exiting connection (tcon) */ 3295 * cifs_build_path_to_root returns full path to root when we do not have an
3296 * existing connection (tcon)
3297 */
3285static char * 3298static char *
3286build_unc_path_to_root(const struct smb_vol *vol, 3299build_unc_path_to_root(const struct smb_vol *vol,
3287 const struct cifs_sb_info *cifs_sb) 3300 const struct cifs_sb_info *cifs_sb)
3288{ 3301{
3289 char *full_path, *pos; 3302 char *full_path, *pos;
3290 unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; 3303 unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
3291 unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); 3304 unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
3292 3305
3293 full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); 3306 full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3298,6 +3311,7 @@ build_unc_path_to_root(const struct smb_vol *vol,
3298 pos = full_path + unc_len; 3311 pos = full_path + unc_len;
3299 3312
3300 if (pplen) { 3313 if (pplen) {
3314 *pos++ = CIFS_DIR_SEP(cifs_sb);
3301 strncpy(pos, vol->prepath, pplen); 3315 strncpy(pos, vol->prepath, pplen);
3302 pos += pplen; 3316 pos += pplen;
3303 } 3317 }
@@ -3353,7 +3367,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
3353 mdata = NULL; 3367 mdata = NULL;
3354 } else { 3368 } else {
3355 cleanup_volume_info_contents(volume_info); 3369 cleanup_volume_info_contents(volume_info);
3356 memset(volume_info, '\0', sizeof(*volume_info));
3357 rc = cifs_setup_volume_info(volume_info, mdata, 3370 rc = cifs_setup_volume_info(volume_info, mdata,
3358 fake_devname); 3371 fake_devname);
3359 } 3372 }
@@ -3375,7 +3388,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
3375 if (cifs_parse_mount_options(mount_data, devname, volume_info)) 3388 if (cifs_parse_mount_options(mount_data, devname, volume_info))
3376 return -EINVAL; 3389 return -EINVAL;
3377 3390
3378
3379 if (volume_info->nullauth) { 3391 if (volume_info->nullauth) {
3380 cFYI(1, "Anonymous login"); 3392 cFYI(1, "Anonymous login");
3381 kfree(volume_info->username); 3393 kfree(volume_info->username);
@@ -3412,7 +3424,7 @@ cifs_get_volume_info(char *mount_data, const char *devname)
3412 int rc; 3424 int rc;
3413 struct smb_vol *volume_info; 3425 struct smb_vol *volume_info;
3414 3426
3415 volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); 3427 volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
3416 if (!volume_info) 3428 if (!volume_info)
3417 return ERR_PTR(-ENOMEM); 3429 return ERR_PTR(-ENOMEM);
3418 3430
@@ -3537,8 +3549,10 @@ remote_path_check:
3537 rc = -ENOSYS; 3549 rc = -ENOSYS;
3538 goto mount_fail_check; 3550 goto mount_fail_check;
3539 } 3551 }
3540 /* build_path_to_root works only when we have a valid tcon */ 3552 /*
3541 full_path = build_path_to_root(volume_info, cifs_sb, tcon); 3553 * cifs_build_path_to_root works only when we have a valid tcon
3554 */
3555 full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
3542 if (full_path == NULL) { 3556 if (full_path == NULL) {
3543 rc = -ENOMEM; 3557 rc = -ENOMEM;
3544 goto mount_fail_check; 3558 goto mount_fail_check;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7c0a81283645..8719bbe0dcc3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry)
44 } while (!IS_ROOT(direntry)); 44 } while (!IS_ROOT(direntry));
45} 45}
46 46
47char *
48cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
49 struct cifs_tcon *tcon)
50{
51 int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
52 int dfsplen;
53 char *full_path = NULL;
54
55 /* if no prefix path, simply set path to the root of share to "" */
56 if (pplen == 0) {
57 full_path = kzalloc(1, GFP_KERNEL);
58 return full_path;
59 }
60
61 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
62 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
63 else
64 dfsplen = 0;
65
66 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
67 if (full_path == NULL)
68 return full_path;
69
70 if (dfsplen)
71 strncpy(full_path, tcon->treeName, dfsplen);
72 full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
73 strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
74 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
75 full_path[dfsplen + pplen] = 0; /* add trailing null */
76 return full_path;
77}
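Editor's note: a hedged worked example of the new cifs_build_path_to_root() above (hypothetical names, backslash as the directory separator):

	/* vol->prepath = "sub/dir", non-DFS mount:
	 *   full_path = "\sub\dir"
	 * DFS mount with tcon->treeName = "\\host\share":
	 *   full_path = "\\host\share\sub\dir"
	 */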
78
47/* Note: caller must free return buffer */ 79/* Note: caller must free return buffer */
48char * 80char *
49build_path_from_dentry(struct dentry *direntry) 81build_path_from_dentry(struct dentry *direntry)
@@ -398,7 +430,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
398 * in network traffic in the other paths. 430 * in network traffic in the other paths.
399 */ 431 */
400 if (!(oflags & O_CREAT)) { 432 if (!(oflags & O_CREAT)) {
401 struct dentry *res = cifs_lookup(inode, direntry, 0); 433 struct dentry *res;
434
435 /*
436 * Check for hashed negative dentry. We have already revalidated
437 * the dentry and it is fine. No need to perform another lookup.
438 */
439 if (!d_unhashed(direntry))
440 return -ENOENT;
441
442 res = cifs_lookup(inode, direntry, 0);
402 if (IS_ERR(res)) 443 if (IS_ERR(res))
403 return PTR_ERR(res); 444 return PTR_ERR(res);
404 445
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index edb25b4bbb95..0a6677ba212b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -505,16 +505,36 @@ out:
505 return rc; 505 return rc;
506} 506}
507 507
508static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
509
508/* 510/*
509 * Try to reacquire byte range locks that were released when session 511 * Try to reacquire byte range locks that were released when session
510 * to server was lost 512 * to server was lost.
511 */ 513 */
512static int cifs_relock_file(struct cifsFileInfo *cifsFile) 514static int
515cifs_relock_file(struct cifsFileInfo *cfile)
513{ 516{
517 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
518 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
519 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
514 int rc = 0; 520 int rc = 0;
515 521
516 /* BB list all locks open on this file and relock */ 522 /* we are going to update can_cache_brlcks here - need a write access */
523 down_write(&cinode->lock_sem);
524 if (cinode->can_cache_brlcks) {
525 /* can cache locks - no need to push them */
526 up_write(&cinode->lock_sem);
527 return rc;
528 }
517 529
530 if (cap_unix(tcon->ses) &&
531 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
532 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
533 rc = cifs_push_posix_locks(cfile);
534 else
535 rc = tcon->ses->server->ops->push_mand_locks(cfile);
536
537 up_write(&cinode->lock_sem);
518 return rc; 538 return rc;
519} 539}
520 540
@@ -739,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
739 } 759 }
740} 760}
741 761
762#define CIFS_LOCK_OP 0
763#define CIFS_READ_OP 1
764#define CIFS_WRITE_OP 2
765
766/* @rw_check : 0 - lock op, 1 - read op, 2 - write op */
742static bool 767static bool
743cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, 768cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
744 __u64 length, __u8 type, struct cifsFileInfo *cfile, 769 __u64 length, __u8 type, struct cifsFileInfo *cfile,
745 struct cifsLockInfo **conf_lock, bool rw_check) 770 struct cifsLockInfo **conf_lock, int rw_check)
746{ 771{
747 struct cifsLockInfo *li; 772 struct cifsLockInfo *li;
748 struct cifsFileInfo *cur_cfile = fdlocks->cfile; 773 struct cifsFileInfo *cur_cfile = fdlocks->cfile;
@@ -752,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
752 if (offset + length <= li->offset || 777 if (offset + length <= li->offset ||
753 offset >= li->offset + li->length) 778 offset >= li->offset + li->length)
754 continue; 779 continue;
755 if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && 780 if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
756 current->tgid == li->pid) 781 server->ops->compare_fids(cfile, cur_cfile)) {
757 continue; 782 /* shared lock prevents write op through the same fid */
783 if (!(li->type & server->vals->shared_lock_type) ||
784 rw_check != CIFS_WRITE_OP)
785 continue;
786 }
758 if ((type & server->vals->shared_lock_type) && 787 if ((type & server->vals->shared_lock_type) &&
759 ((server->ops->compare_fids(cfile, cur_cfile) && 788 ((server->ops->compare_fids(cfile, cur_cfile) &&
760 current->tgid == li->pid) || type == li->type)) 789 current->tgid == li->pid) || type == li->type))
@@ -769,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
769bool 798bool
770cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, 799cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
771 __u8 type, struct cifsLockInfo **conf_lock, 800 __u8 type, struct cifsLockInfo **conf_lock,
772 bool rw_check) 801 int rw_check)
773{ 802{
774 bool rc = false; 803 bool rc = false;
775 struct cifs_fid_locks *cur; 804 struct cifs_fid_locks *cur;
@@ -805,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
805 down_read(&cinode->lock_sem); 834 down_read(&cinode->lock_sem);
806 835
807 exist = cifs_find_lock_conflict(cfile, offset, length, type, 836 exist = cifs_find_lock_conflict(cfile, offset, length, type,
808 &conf_lock, false); 837 &conf_lock, CIFS_LOCK_OP);
809 if (exist) { 838 if (exist) {
810 flock->fl_start = conf_lock->offset; 839 flock->fl_start = conf_lock->offset;
811 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 840 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -852,7 +881,7 @@ try_again:
852 down_write(&cinode->lock_sem); 881 down_write(&cinode->lock_sem);
853 882
854 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, 883 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
855 lock->type, &conf_lock, false); 884 lock->type, &conf_lock, CIFS_LOCK_OP);
856 if (!exist && cinode->can_cache_brlcks) { 885 if (!exist && cinode->can_cache_brlcks) {
857 list_add_tail(&lock->llist, &cfile->llist->locks); 886 list_add_tail(&lock->llist, &cfile->llist->locks);
858 up_write(&cinode->lock_sem); 887 up_write(&cinode->lock_sem);
@@ -948,7 +977,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
948 int rc = 0, stored_rc; 977 int rc = 0, stored_rc;
949 struct cifsLockInfo *li, *tmp; 978 struct cifsLockInfo *li, *tmp;
950 struct cifs_tcon *tcon; 979 struct cifs_tcon *tcon;
951 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
952 unsigned int num, max_num, max_buf; 980 unsigned int num, max_num, max_buf;
953 LOCKING_ANDX_RANGE *buf, *cur; 981 LOCKING_ANDX_RANGE *buf, *cur;
954 int types[] = {LOCKING_ANDX_LARGE_FILES, 982 int types[] = {LOCKING_ANDX_LARGE_FILES,
@@ -958,21 +986,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
958 xid = get_xid(); 986 xid = get_xid();
959 tcon = tlink_tcon(cfile->tlink); 987 tcon = tlink_tcon(cfile->tlink);
960 988
961 /* we are going to update can_cache_brlcks here - need a write access */
962 down_write(&cinode->lock_sem);
963 if (!cinode->can_cache_brlcks) {
964 up_write(&cinode->lock_sem);
965 free_xid(xid);
966 return rc;
967 }
968
969 /* 989 /*
970 * Accessing maxBuf is racy with cifs_reconnect - need to store value 990 * Accessing maxBuf is racy with cifs_reconnect - need to store value
971 * and check it for zero before using. 991 * and check it for zero before using.
972 */ 992 */
973 max_buf = tcon->ses->server->maxBuf; 993 max_buf = tcon->ses->server->maxBuf;
974 if (!max_buf) { 994 if (!max_buf) {
975 up_write(&cinode->lock_sem);
976 free_xid(xid); 995 free_xid(xid);
977 return -EINVAL; 996 return -EINVAL;
978 } 997 }
@@ -981,7 +1000,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
981 sizeof(LOCKING_ANDX_RANGE); 1000 sizeof(LOCKING_ANDX_RANGE);
982 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 1001 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
983 if (!buf) { 1002 if (!buf) {
984 up_write(&cinode->lock_sem);
985 free_xid(xid); 1003 free_xid(xid);
986 return -ENOMEM; 1004 return -ENOMEM;
987 } 1005 }
@@ -1018,9 +1036,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1018 } 1036 }
1019 } 1037 }
1020 1038
1021 cinode->can_cache_brlcks = false;
1022 up_write(&cinode->lock_sem);
1023
1024 kfree(buf); 1039 kfree(buf);
1025 free_xid(xid); 1040 free_xid(xid);
1026 return rc; 1041 return rc;
@@ -1043,7 +1058,6 @@ struct lock_to_push {
1043static int 1058static int
1044cifs_push_posix_locks(struct cifsFileInfo *cfile) 1059cifs_push_posix_locks(struct cifsFileInfo *cfile)
1045{ 1060{
1046 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1047 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1061 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1048 struct file_lock *flock, **before; 1062 struct file_lock *flock, **before;
1049 unsigned int count = 0, i = 0; 1063 unsigned int count = 0, i = 0;
@@ -1054,14 +1068,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1054 1068
1055 xid = get_xid(); 1069 xid = get_xid();
1056 1070
1057 /* we are going to update can_cache_brlcks here - need a write access */
1058 down_write(&cinode->lock_sem);
1059 if (!cinode->can_cache_brlcks) {
1060 up_write(&cinode->lock_sem);
1061 free_xid(xid);
1062 return rc;
1063 }
1064
1065 lock_flocks(); 1071 lock_flocks();
1066 cifs_for_each_lock(cfile->dentry->d_inode, before) { 1072 cifs_for_each_lock(cfile->dentry->d_inode, before) {
1067 if ((*before)->fl_flags & FL_POSIX) 1073 if ((*before)->fl_flags & FL_POSIX)
@@ -1127,9 +1133,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1127 } 1133 }
1128 1134
1129out: 1135out:
1130 cinode->can_cache_brlcks = false;
1131 up_write(&cinode->lock_sem);
1132
1133 free_xid(xid); 1136 free_xid(xid);
1134 return rc; 1137 return rc;
1135err_out: 1138err_out:
@@ -1144,14 +1147,27 @@ static int
1144cifs_push_locks(struct cifsFileInfo *cfile) 1147cifs_push_locks(struct cifsFileInfo *cfile)
1145{ 1148{
1146 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); 1149 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
1150 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1147 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1151 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1152 int rc = 0;
1153
1154 /* we are going to update can_cache_brlcks here - need a write access */
1155 down_write(&cinode->lock_sem);
1156 if (!cinode->can_cache_brlcks) {
1157 up_write(&cinode->lock_sem);
1158 return rc;
1159 }
1148 1160
1149 if (cap_unix(tcon->ses) && 1161 if (cap_unix(tcon->ses) &&
1150 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 1162 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
1151 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 1163 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
1152 return cifs_push_posix_locks(cfile); 1164 rc = cifs_push_posix_locks(cfile);
1165 else
1166 rc = tcon->ses->server->ops->push_mand_locks(cfile);
1153 1167
1154 return tcon->ses->server->ops->push_mand_locks(cfile); 1168 cinode->can_cache_brlcks = false;
1169 up_write(&cinode->lock_sem);
1170 return rc;
1155} 1171}
1156 1172
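
This hunk is the other half of the refactor above: the can_cache_brlcks test, the lock_sem write section, and the flag clearing all move into the single caller, so each push helper runs entirely inside one critical section and the flag flips exactly once. A sketch of the pattern with POSIX threads (names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool can_cache_brlcks = true;

static int push_posix_locks(void) { puts("pushing posix locks"); return 0; }
static int push_mand_locks(void)  { puts("pushing mandatory locks"); return 0; }

static int push_locks(bool use_posix)
{
	int rc = 0;

	/* we are going to update can_cache_brlcks - need write access */
	pthread_rwlock_wrlock(&lock_sem);
	if (!can_cache_brlcks) {
		pthread_rwlock_unlock(&lock_sem);
		return rc;			/* already pushed */
	}

	rc = use_posix ? push_posix_locks() : push_mand_locks();

	can_cache_brlcks = false;		/* flips exactly once */
	pthread_rwlock_unlock(&lock_sem);
	return rc;
}

int main(void)
{
	push_locks(true);
	push_locks(true);			/* second call is a no-op */
	return 0;
}

(Build with cc -pthread.)
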
1157static void 1173static void
@@ -1436,16 +1452,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1436 return -ENOMEM; 1452 return -ENOMEM;
1437 1453
1438 rc = cifs_lock_add_if(cfile, lock, wait_flag); 1454 rc = cifs_lock_add_if(cfile, lock, wait_flag);
1439 if (rc < 0) 1455 if (rc < 0) {
1440 kfree(lock); 1456 kfree(lock);
1441 if (rc <= 0) 1457 return rc;
1458 }
1459 if (!rc)
1442 goto out; 1460 goto out;
1443 1461
1444 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, 1462 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1445 type, 1, 0, wait_flag); 1463 type, 1, 0, wait_flag);
1446 if (rc) { 1464 if (rc) {
1447 kfree(lock); 1465 kfree(lock);
1448 goto out; 1466 return rc;
1449 } 1467 }
1450 1468
1451 cifs_lock_add(cfile, lock); 1469 cifs_lock_add(cfile, lock);
@@ -1794,7 +1812,6 @@ static int cifs_writepages(struct address_space *mapping,
1794 struct TCP_Server_Info *server; 1812 struct TCP_Server_Info *server;
1795 struct page *page; 1813 struct page *page;
1796 int rc = 0; 1814 int rc = 0;
1797 loff_t isize = i_size_read(mapping->host);
1798 1815
1799 /* 1816 /*
1800 * If wsize is smaller than the page cache size, default to writing 1817 * If wsize is smaller than the page cache size, default to writing
@@ -1899,7 +1916,7 @@ retry:
1899 */ 1916 */
1900 set_page_writeback(page); 1917 set_page_writeback(page);
1901 1918
1902 if (page_offset(page) >= isize) { 1919 if (page_offset(page) >= i_size_read(mapping->host)) {
1903 done = true; 1920 done = true;
1904 unlock_page(page); 1921 unlock_page(page);
1905 end_page_writeback(page); 1922 end_page_writeback(page);
@@ -1932,7 +1949,8 @@ retry:
1932 wdata->offset = page_offset(wdata->pages[0]); 1949 wdata->offset = page_offset(wdata->pages[0]);
1933 wdata->pagesz = PAGE_CACHE_SIZE; 1950 wdata->pagesz = PAGE_CACHE_SIZE;
1934 wdata->tailsz = 1951 wdata->tailsz =
1935 min(isize - page_offset(wdata->pages[nr_pages - 1]), 1952 min(i_size_read(mapping->host) -
1953 page_offset(wdata->pages[nr_pages - 1]),
1936 (loff_t)PAGE_CACHE_SIZE); 1954 (loff_t)PAGE_CACHE_SIZE);
1937 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + 1955 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
1938 wdata->tailsz; 1956 wdata->tailsz;
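
Both writepages changes replace the isize snapshot taken at entry with fresh i_size_read() calls, since the file can grow or shrink while the loop runs and a stale value would mis-size the tail page. The idea, in userspace terms with illustrative names:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for i_size_read(): sample the racy size at each use
   instead of caching one copy for the whole loop. */
static _Atomic long long isize = 4096;

static long long i_size_read(void) { return atomic_load(&isize); }

static long long tail_bytes(long long page_off, long long page_size)
{
	long long rem = i_size_read() - page_off;	/* fresh value */

	return rem < page_size ? rem : page_size;
}

int main(void)
{
	printf("tail = %lld\n", tail_bytes(3840, 512));	/* 256 */
	atomic_store(&isize, 8192);			/* file grew */
	printf("tail = %lld\n", tail_bytes(3840, 512));	/* 512 */
	return 0;
}
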
@@ -2085,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
2085 } else { 2103 } else {
2086 rc = copied; 2104 rc = copied;
2087 pos += copied; 2105 pos += copied;
2088 set_page_dirty(page); 2106 /*
2107 * When we use strict cache mode and cifs_strict_writev was run
2108 * with level II oplock (indicated by leave_pages_clean field of
2109 * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev
2110 * sent the data to the server itself.
2111 */
2112 if (!CIFS_I(inode)->leave_pages_clean ||
2113 !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO))
2114 set_page_dirty(page);
2089 } 2115 }
2090 2116
2091 if (rc > 0) { 2117 if (rc > 0) {
@@ -2436,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
2436} 2462}
2437 2463
2438static ssize_t 2464static ssize_t
2439cifs_writev(struct kiocb *iocb, const struct iovec *iov, 2465cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,
2440 unsigned long nr_segs, loff_t pos) 2466 unsigned long nr_segs, loff_t pos, bool cache_ex)
2441{ 2467{
2442 struct file *file = iocb->ki_filp; 2468 struct file *file = iocb->ki_filp;
2443 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; 2469 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2457,10 +2483,14 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
2457 down_read(&cinode->lock_sem); 2483 down_read(&cinode->lock_sem);
2458 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2484 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2459 server->vals->exclusive_lock_type, NULL, 2485 server->vals->exclusive_lock_type, NULL,
2460 true)) { 2486 CIFS_WRITE_OP)) {
2461 mutex_lock(&inode->i_mutex); 2487 mutex_lock(&inode->i_mutex);
2488 if (!cache_ex)
2489 cinode->leave_pages_clean = true;
2462 rc = __generic_file_aio_write(iocb, iov, nr_segs, 2490 rc = __generic_file_aio_write(iocb, iov, nr_segs,
2463 &iocb->ki_pos); 2491 &iocb->ki_pos);
2492 if (!cache_ex)
2493 cinode->leave_pages_clean = false;
2464 mutex_unlock(&inode->i_mutex); 2494 mutex_unlock(&inode->i_mutex);
2465 } 2495 }
2466 2496
@@ -2487,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
2487 struct cifsFileInfo *cfile = (struct cifsFileInfo *) 2517 struct cifsFileInfo *cfile = (struct cifsFileInfo *)
2488 iocb->ki_filp->private_data; 2518 iocb->ki_filp->private_data;
2489 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 2519 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
2490 2520 ssize_t written, written2;
2491#ifdef CONFIG_CIFS_SMB2
2492 /* 2521 /*
2493 * If we have an oplock for read and want to write data to the file 2522 * We need to store clientCanCacheAll here to prevent race
2494 * we need to store it in the page cache and then push it to the server 2523 * conditions - this value can be changed during execution
2495 * to be sure the next read will get valid data. 2524 * of generic_file_aio_write. For CIFS it can be changed from
2525 * true to false only, but for SMB2 it can change in either
2526 * direction. So we can end up with data
2527 * stored in the cache, not marked dirty and not sent to the
2528 * server if this value changes its state from false to true
2529 * after cifs_write_end.
2496 */ 2530 */
2497 if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { 2531 bool cache_ex = cinode->clientCanCacheAll;
2498 ssize_t written; 2532 bool cache_read = cinode->clientCanCacheRead;
2499 int rc; 2533 int rc;
2500 2534 loff_t saved_pos;
2501 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
2502 rc = filemap_fdatawrite(inode->i_mapping);
2503 if (rc)
2504 return (ssize_t)rc;
2505 2535
2506 return written; 2536 if (cache_ex) {
2537 if (cap_unix(tcon->ses) &&
2538 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
2539 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
2540 tcon->fsUnixInfo.Capability)))
2541 return generic_file_aio_write(iocb, iov, nr_segs, pos);
2542 return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex);
2507 } 2543 }
2508#endif
2509 2544
2510 /* 2545 /*
2511 * For non-oplocked files in strict cache mode we need to write the data 2546 * For files without exclusive oplock in strict cache mode we need to
2512 * to the server exactly from pos to pos+len-1 rather than flushing all 2547 * write the data to the server exactly from pos to pos+len-1 rather
2513 * affected pages because it may cause an error with mandatory locks on 2548 * than flushing all affected pages because it may cause an error with
2514 * these pages but not on the region from pos to pos+len-1. 2549 * mandatory locks on these pages but not on the region from pos to
2550 * pos+len-1.
2515 */ 2551 */
2552 written = cifs_user_writev(iocb, iov, nr_segs, pos);
2553 if (!cache_read || written <= 0)
2554 return written;
2516 2555
2517 if (!cinode->clientCanCacheAll) 2556 saved_pos = iocb->ki_pos;
2518 return cifs_user_writev(iocb, iov, nr_segs, pos); 2557 iocb->ki_pos = pos;
2519 2558 /* we have a read oplock - need to store the data in the page cache */
2520 if (cap_unix(tcon->ses) && 2559 if (cap_unix(tcon->ses) &&
2521 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 2560 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
2522 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 2561 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
2523 return generic_file_aio_write(iocb, iov, nr_segs, pos); 2562 tcon->fsUnixInfo.Capability)))
2524 2563 written2 = generic_file_aio_write(iocb, iov, nr_segs, pos);
2525 return cifs_writev(iocb, iov, nr_segs, pos); 2564 else
2565 written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos,
2566 cache_ex);
2567 /* errors occurred during writing - invalidate the page cache */
2568 if (written2 < 0) {
2569 rc = cifs_invalidate_mapping(inode);
2570 if (rc)
2571 written = (ssize_t)rc;
2572 else
2573 iocb->ki_pos = saved_pos;
2574 }
2575 return written;
2526} 2576}
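
The rewritten path writes through to the server first (cifs_user_writev), and only when a read oplock is held replays the bytes into the page cache, with leave_pages_clean keeping cifs_write_end from re-dirtying pages that were already sent; if the replay fails, the mapping is invalidated rather than left stale. A toy model of that control flow (every name here is illustrative):

#include <stdio.h>
#include <string.h>

static char server[64], cache[64];
static int cache_valid = 1;

static int server_write(const char *buf, size_t len, size_t pos)
{
	memcpy(server + pos, buf, len);
	return 0;
}

static int cache_write(const char *buf, size_t len, size_t pos)
{
	memcpy(cache + pos, buf, len);	/* pages stay "clean": already sent */
	return 0;			/* return -1 to simulate failure */
}

static int strict_write(const char *buf, size_t len, size_t pos,
			int have_read_oplock)
{
	if (server_write(buf, len, pos))
		return -1;
	if (!have_read_oplock)
		return 0;		/* nothing cached, nothing to fix */
	if (cache_write(buf, len, pos))
		cache_valid = 0;	/* invalidate instead of diverging */
	return 0;
}

int main(void)
{
	strict_write("hello", 5, 0, 1);
	printf("server=%.5s cache=%.5s valid=%d\n", server, cache, cache_valid);
	return 0;
}
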
2527 2577
2528static struct cifs_readdata * 2578static struct cifs_readdata *
@@ -2892,7 +2942,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
2892 down_read(&cinode->lock_sem); 2942 down_read(&cinode->lock_sem);
2893 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2943 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2894 tcon->ses->server->vals->shared_lock_type, 2944 tcon->ses->server->vals->shared_lock_type,
2895 NULL, true)) 2945 NULL, CIFS_READ_OP))
2896 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 2946 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
2897 up_read(&cinode->lock_sem); 2947 up_read(&cinode->lock_sem);
2898 return rc; 2948 return rc;
@@ -3536,7 +3586,7 @@ void cifs_oplock_break(struct work_struct *work)
3536 if (cinode->clientCanCacheRead == 0) { 3586 if (cinode->clientCanCacheRead == 0) {
3537 rc = filemap_fdatawait(inode->i_mapping); 3587 rc = filemap_fdatawait(inode->i_mapping);
3538 mapping_set_error(inode->i_mapping, rc); 3588 mapping_set_error(inode->i_mapping, rc);
3539 invalidate_remote_inode(inode); 3589 cifs_invalidate_mapping(inode);
3540 } 3590 }
3541 cFYI(1, "Oplock flush inode %p rc %d", inode, rc); 3591 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
3542 } 3592 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index afdff79651f1..ed6208ff85a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1791 stat->ino = CIFS_I(inode)->uniqueid; 1791 stat->ino = CIFS_I(inode)->uniqueid;
1792 1792
1793 /* 1793 /*
1794 * If on a multiuser mount without unix extensions, and the admin hasn't 1794 * If on a multiuser mount without unix extensions or cifsacl being
1795 * overridden them, set the ownership to the fsuid/fsgid of the current 1795 * enabled, and the admin hasn't overridden them, set the ownership
1796 * process. 1796 * to the fsuid/fsgid of the current process.
1797 */ 1797 */
1798 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && 1798 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1799 !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1799 !tcon->unix_ext) { 1800 !tcon->unix_ext) {
1800 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) 1801 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1801 stat->uid = current_fsuid(); 1802 stat->uid = current_fsuid();
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d5ce9e26696c..a82bc51fdc82 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
204 return rc; 204 return rc;
205} 205}
206 206
207int 207void
208cifs_set_port(struct sockaddr *addr, const unsigned short int port) 208cifs_set_port(struct sockaddr *addr, const unsigned short int port)
209{ 209{
210 switch (addr->sa_family) { 210 switch (addr->sa_family) {
@@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port)
214 case AF_INET6: 214 case AF_INET6:
215 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); 215 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
216 break; 216 break;
217 default:
218 return 0;
219 } 217 }
220 return 1;
221}
222
223int
224cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
225 const unsigned short int port)
226{
227 if (!cifs_convert_address(dst, src, len))
228 return 0;
229 return cifs_set_port(dst, port);
230} 218}
231 219
232/***************************************************************************** 220/*****************************************************************************
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f9b5d3d6cf33..6002fdc920ae 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
66#endif /* DEBUG2 */ 66#endif /* DEBUG2 */
67 67
68/* 68/*
69 * Attempt to preload the dcache with the FIND_FIRST/NEXT results
70 *
69 * Find the dentry that matches "name". If there isn't one, create one. If it's 71 * Find the dentry that matches "name". If there isn't one, create one. If it's
70 * a negative dentry or the uniqueid changed, then drop it and recreate it. 72 * a negative dentry or the uniqueid changed, then drop it and recreate it.
71 */ 73 */
72static struct dentry * 74static void
73cifs_readdir_lookup(struct dentry *parent, struct qstr *name, 75cifs_prime_dcache(struct dentry *parent, struct qstr *name,
74 struct cifs_fattr *fattr) 76 struct cifs_fattr *fattr)
75{ 77{
76 struct dentry *dentry, *alias; 78 struct dentry *dentry, *alias;
77 struct inode *inode; 79 struct inode *inode;
78 struct super_block *sb = parent->d_inode->i_sb; 80 struct super_block *sb = parent->d_inode->i_sb;
79 81
80 cFYI(1, "For %s", name->name); 82 cFYI(1, "%s: for %s", __func__, name->name);
81 83
82 if (parent->d_op && parent->d_op->d_hash) 84 if (parent->d_op && parent->d_op->d_hash)
83 parent->d_op->d_hash(parent, parent->d_inode, name); 85 parent->d_op->d_hash(parent, parent->d_inode, name);
@@ -86,35 +88,33 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
86 88
87 dentry = d_lookup(parent, name); 89 dentry = d_lookup(parent, name);
88 if (dentry) { 90 if (dentry) {
91 int err;
92
89 inode = dentry->d_inode; 93 inode = dentry->d_inode;
90 /* update inode in place if i_ino didn't change */ 94 /* update inode in place if i_ino didn't change */
91 if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { 95 if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
92 cifs_fattr_to_inode(inode, fattr); 96 cifs_fattr_to_inode(inode, fattr);
93 return dentry; 97 goto out;
94 } 98 }
95 d_drop(dentry); 99 err = d_invalidate(dentry);
96 dput(dentry); 100 dput(dentry);
101 if (err)
102 return;
97 } 103 }
98 104
99 dentry = d_alloc(parent, name); 105 dentry = d_alloc(parent, name);
100 if (dentry == NULL) 106 if (!dentry)
101 return NULL; 107 return;
102 108
103 inode = cifs_iget(sb, fattr); 109 inode = cifs_iget(sb, fattr);
104 if (!inode) { 110 if (!inode)
105 dput(dentry); 111 goto out;
106 return NULL;
107 }
108 112
109 alias = d_materialise_unique(dentry, inode); 113 alias = d_materialise_unique(dentry, inode);
110 if (alias != NULL) { 114 if (alias && !IS_ERR(alias))
111 dput(dentry); 115 dput(alias);
112 if (IS_ERR(alias)) 116out:
113 return NULL; 117 dput(dentry);
114 dentry = alias;
115 }
116
117 return dentry;
118} 118}
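
cifs_prime_dcache keeps the same lookup-reuse-or-rebuild shape as the old helper but no longer returns the dentry, since the caller only wanted the side effect. The cache discipline it implements, as a toy single-slot model (illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { char name[32]; unsigned long long uniqueid; };

static struct entry *slot;		/* one-slot "dcache" */

static void prime_cache(const char *name, unsigned long long id)
{
	if (slot && strcmp(slot->name, name) == 0) {
		if (slot->uniqueid == id)
			return;		/* still valid: reuse in place */
		free(slot);		/* id changed: drop and recreate */
		slot = NULL;
	}
	slot = malloc(sizeof(*slot));
	if (!slot)
		return;			/* best effort, like the fs code */
	snprintf(slot->name, sizeof(slot->name), "%s", name);
	slot->uniqueid = id;
}

int main(void)
{
	prime_cache("file.txt", 1001);
	prime_cache("file.txt", 1001);	/* hit, reused */
	prime_cache("file.txt", 2002);	/* id changed, recreated */
	printf("cached %s id %llu\n", slot->name, slot->uniqueid);
	return 0;
}
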
119 119
120static void 120static void
@@ -134,6 +134,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
134 if (fattr->cf_cifsattrs & ATTR_READONLY) 134 if (fattr->cf_cifsattrs & ATTR_READONLY)
135 fattr->cf_mode &= ~S_IWUGO; 135 fattr->cf_mode &= ~S_IWUGO;
136 136
137 /*
138 * We of course don't get ACL info in FIND_FIRST/NEXT results, so
139 * mark it for revalidation so that "ls -l" will look right. It might
140 * be super-slow, but if we don't do this then the ownership of files
141 * may look wrong since the inodes may not have timed out by the time
142 * "ls" does a stat() call on them.
143 */
144 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
145 fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
146
137 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && 147 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
138 fattr->cf_cifsattrs & ATTR_SYSTEM) { 148 fattr->cf_cifsattrs & ATTR_SYSTEM) {
139 if (fattr->cf_eof == 0) { 149 if (fattr->cf_eof == 0) {
@@ -649,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
649 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 659 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
650 struct cifs_dirent de = { NULL, }; 660 struct cifs_dirent de = { NULL, };
651 struct cifs_fattr fattr; 661 struct cifs_fattr fattr;
652 struct dentry *dentry;
653 struct qstr name; 662 struct qstr name;
654 int rc = 0; 663 int rc = 0;
655 ino_t ino; 664 ino_t ino;
@@ -720,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
720 */ 729 */
721 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 730 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
722 731
723 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 732 cifs_prime_dcache(file->f_dentry, &name, &fattr);
724 dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
725 733
734 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
726 rc = filldir(dirent, name.name, name.len, file->f_pos, ino, 735 rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
727 fattr.cf_dtype); 736 fattr.cf_dtype);
728
729 dput(dentry);
730 return rc; 737 return rc;
731} 738}
732 739
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 56cc4be87807..a5d234c8d5d9 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
575 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); 575 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
576} 576}
577 577
578static char *
579cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
580 struct cifs_tcon *tcon)
581{
582 int pplen = vol->prepath ? strlen(vol->prepath) : 0;
583 int dfsplen;
584 char *full_path = NULL;
585
586 /* if no prefix path, simply set path to the root of share to "" */
587 if (pplen == 0) {
588 full_path = kzalloc(1, GFP_KERNEL);
589 return full_path;
590 }
591
592 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
593 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
594 else
595 dfsplen = 0;
596
597 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
598 if (full_path == NULL)
599 return full_path;
600
601 if (dfsplen)
602 strncpy(full_path, tcon->treeName, dfsplen);
603 strncpy(full_path + dfsplen, vol->prepath, pplen);
604 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
605 full_path[dfsplen + pplen] = 0; /* add trailing null */
606 return full_path;
607}
608
609static void 578static void
610cifs_clear_stats(struct cifs_tcon *tcon) 579cifs_clear_stats(struct cifs_tcon *tcon)
611{ 580{
@@ -766,7 +735,6 @@ smb_set_file_info(struct inode *inode, const char *full_path,
766 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 735 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
767 struct tcon_link *tlink = NULL; 736 struct tcon_link *tlink = NULL;
768 struct cifs_tcon *tcon; 737 struct cifs_tcon *tcon;
769 FILE_BASIC_INFO info_buf;
770 738
771 /* if the file is already open for write, just use that fileid */ 739 /* if the file is already open for write, just use that fileid */
772 open_file = find_writable_file(cinode, true); 740 open_file = find_writable_file(cinode, true);
@@ -817,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
817 netpid = current->tgid; 785 netpid = current->tgid;
818 786
819set_via_filehandle: 787set_via_filehandle:
820 rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); 788 rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);
821 if (!rc) 789 if (!rc)
822 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 790 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
823 791
@@ -944,7 +912,6 @@ struct smb_version_operations smb1_operations = {
944 .set_path_size = CIFSSMBSetEOF, 912 .set_path_size = CIFSSMBSetEOF,
945 .set_file_size = CIFSSMBSetFileSize, 913 .set_file_size = CIFSSMBSetFileSize,
946 .set_file_info = smb_set_file_info, 914 .set_file_info = smb_set_file_info,
947 .build_path_to_root = cifs_build_path_to_root,
948 .echo = CIFSSMBEcho, 915 .echo = CIFSSMBEcho,
949 .mkdir = CIFSSMBMkDir, 916 .mkdir = CIFSSMBMkDir,
950 .mkdir_setinfo = cifs_mkdir_setinfo, 917 .mkdir_setinfo = cifs_mkdir_setinfo,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index a93eec30a50d..71e6aed4b382 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
260 struct cifs_fid_locks *fdlocks; 260 struct cifs_fid_locks *fdlocks;
261 261
262 xid = get_xid(); 262 xid = get_xid();
263 /* we are going to update can_cache_brlcks here - need a write access */
264 down_write(&cinode->lock_sem);
265 if (!cinode->can_cache_brlcks) {
266 up_write(&cinode->lock_sem);
267 free_xid(xid);
268 return rc;
269 }
270 263
271 /* 264 /*
272 * Accessing maxBuf is racy with cifs_reconnect - need to store value 265 * Accessing maxBuf is racy with cifs_reconnect - need to store value
@@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
274 */ 267 */
275 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; 268 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
276 if (!max_buf) { 269 if (!max_buf) {
277 up_write(&cinode->lock_sem);
278 free_xid(xid); 270 free_xid(xid);
279 return -EINVAL; 271 return -EINVAL;
280 } 272 }
@@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
282 max_num = max_buf / sizeof(struct smb2_lock_element); 274 max_num = max_buf / sizeof(struct smb2_lock_element);
283 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); 275 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
284 if (!buf) { 276 if (!buf) {
285 up_write(&cinode->lock_sem);
286 free_xid(xid); 277 free_xid(xid);
287 return -ENOMEM; 278 return -ENOMEM;
288 } 279 }
@@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
293 rc = stored_rc; 284 rc = stored_rc;
294 } 285 }
295 286
296 cinode->can_cache_brlcks = false;
297 kfree(buf); 287 kfree(buf);
298
299 up_write(&cinode->lock_sem);
300 free_xid(xid); 288 free_xid(xid);
301 return rc; 289 return rc;
302} 290}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4d9dbe0b7385..d79de7bc4435 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
262 return rc; 262 return rc;
263} 263}
264 264
265static char *
266smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
267 struct cifs_tcon *tcon)
268{
269 int pplen = vol->prepath ? strlen(vol->prepath) : 0;
270 char *full_path = NULL;
271
272 /* if no prefix path, simply set path to the root of share to "" */
273 if (pplen == 0) {
274 full_path = kzalloc(2, GFP_KERNEL);
275 return full_path;
276 }
277
278 cERROR(1, "prefixpath is not supported for SMB2 now");
279 return NULL;
280}
281
282static bool 265static bool
283smb2_can_echo(struct TCP_Server_Info *server) 266smb2_can_echo(struct TCP_Server_Info *server)
284{ 267{
@@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = {
613 .set_path_size = smb2_set_path_size, 596 .set_path_size = smb2_set_path_size,
614 .set_file_size = smb2_set_file_size, 597 .set_file_size = smb2_set_file_size,
615 .set_file_info = smb2_set_file_info, 598 .set_file_info = smb2_set_file_info,
616 .build_path_to_root = smb2_build_path_to_root,
617 .mkdir = smb2_mkdir, 599 .mkdir = smb2_mkdir,
618 .mkdir_setinfo = smb2_mkdir_setinfo, 600 .mkdir_setinfo = smb2_mkdir_setinfo,
619 .rmdir = smb2_rmdir, 601 .rmdir = smb2_rmdir,
@@ -641,6 +623,91 @@ struct smb_version_operations smb21_operations = {
641 .get_lease_key = smb2_get_lease_key, 623 .get_lease_key = smb2_get_lease_key,
642 .set_lease_key = smb2_set_lease_key, 624 .set_lease_key = smb2_set_lease_key,
643 .new_lease_key = smb2_new_lease_key, 625 .new_lease_key = smb2_new_lease_key,
626 .calc_signature = smb2_calc_signature,
627};
628
629
630struct smb_version_operations smb30_operations = {
631 .compare_fids = smb2_compare_fids,
632 .setup_request = smb2_setup_request,
633 .setup_async_request = smb2_setup_async_request,
634 .check_receive = smb2_check_receive,
635 .add_credits = smb2_add_credits,
636 .set_credits = smb2_set_credits,
637 .get_credits_field = smb2_get_credits_field,
638 .get_credits = smb2_get_credits,
639 .get_next_mid = smb2_get_next_mid,
640 .read_data_offset = smb2_read_data_offset,
641 .read_data_length = smb2_read_data_length,
642 .map_error = map_smb2_to_linux_error,
643 .find_mid = smb2_find_mid,
644 .check_message = smb2_check_message,
645 .dump_detail = smb2_dump_detail,
646 .clear_stats = smb2_clear_stats,
647 .print_stats = smb2_print_stats,
648 .is_oplock_break = smb2_is_valid_oplock_break,
649 .need_neg = smb2_need_neg,
650 .negotiate = smb2_negotiate,
651 .negotiate_wsize = smb2_negotiate_wsize,
652 .negotiate_rsize = smb2_negotiate_rsize,
653 .sess_setup = SMB2_sess_setup,
654 .logoff = SMB2_logoff,
655 .tree_connect = SMB2_tcon,
656 .tree_disconnect = SMB2_tdis,
657 .is_path_accessible = smb2_is_path_accessible,
658 .can_echo = smb2_can_echo,
659 .echo = SMB2_echo,
660 .query_path_info = smb2_query_path_info,
661 .get_srv_inum = smb2_get_srv_inum,
662 .query_file_info = smb2_query_file_info,
663 .set_path_size = smb2_set_path_size,
664 .set_file_size = smb2_set_file_size,
665 .set_file_info = smb2_set_file_info,
666 .mkdir = smb2_mkdir,
667 .mkdir_setinfo = smb2_mkdir_setinfo,
668 .rmdir = smb2_rmdir,
669 .unlink = smb2_unlink,
670 .rename = smb2_rename_path,
671 .create_hardlink = smb2_create_hardlink,
672 .open = smb2_open_file,
673 .set_fid = smb2_set_fid,
674 .close = smb2_close_file,
675 .flush = smb2_flush_file,
676 .async_readv = smb2_async_readv,
677 .async_writev = smb2_async_writev,
678 .sync_read = smb2_sync_read,
679 .sync_write = smb2_sync_write,
680 .query_dir_first = smb2_query_dir_first,
681 .query_dir_next = smb2_query_dir_next,
682 .close_dir = smb2_close_dir,
683 .calc_smb_size = smb2_calc_size,
684 .is_status_pending = smb2_is_status_pending,
685 .oplock_response = smb2_oplock_response,
686 .queryfs = smb2_queryfs,
687 .mand_lock = smb2_mand_lock,
688 .mand_unlock_range = smb2_unlock_range,
689 .push_mand_locks = smb2_push_mandatory_locks,
690 .get_lease_key = smb2_get_lease_key,
691 .set_lease_key = smb2_set_lease_key,
692 .new_lease_key = smb2_new_lease_key,
693 .calc_signature = smb3_calc_signature,
694};
695
696struct smb_version_values smb20_values = {
697 .version_string = SMB20_VERSION_STRING,
698 .protocol_id = SMB20_PROT_ID,
699 .req_capabilities = 0, /* MBZ */
700 .large_lock_type = 0,
701 .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
702 .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
703 .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
704 .header_size = sizeof(struct smb2_hdr),
705 .max_header_size = MAX_SMB2_HDR_SIZE,
706 .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
707 .lock_cmd = SMB2_LOCK,
708 .cap_unix = 0,
709 .cap_nt_find = SMB2_NT_FIND,
710 .cap_large_files = SMB2_LARGE_FILES,
644}; 711};
645 712
646struct smb_version_values smb21_values = { 713struct smb_version_values smb21_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index cf33622cdac8..41d9d0725f0f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
425 } 425 }
426 426
427 cFYI(1, "sec_flags 0x%x", sec_flags); 427 cFYI(1, "sec_flags 0x%x", sec_flags);
428 if (sec_flags & CIFSSEC_MUST_SIGN) { 428 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
429 cFYI(1, "Signing required"); 429 cFYI(1, "Signing required");
430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | 430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
431 SMB2_NEGOTIATE_SIGNING_ENABLED))) { 431 SMB2_NEGOTIATE_SIGNING_ENABLED))) {
@@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate:
612 612
613 /* BB add code to build os and lm fields */ 613 /* BB add code to build os and lm fields */
614 614
615 rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR); 615 rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
616 CIFS_LOG_ERROR | CIFS_NEG_OP);
616 617
617 kfree(security_blob); 618 kfree(security_blob);
618 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; 619 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 7d25f8b14f93..2aa3535e38ce 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
47 struct smb_rqst *rqst); 47 struct smb_rqst *rqst);
48extern struct mid_q_entry *smb2_setup_async_request( 48extern struct mid_q_entry *smb2_setup_async_request(
49 struct TCP_Server_Info *server, struct smb_rqst *rqst); 49 struct TCP_Server_Info *server, struct smb_rqst *rqst);
50extern int smb2_calc_signature(struct smb_rqst *rqst,
51 struct TCP_Server_Info *server);
52extern int smb3_calc_signature(struct smb_rqst *rqst,
53 struct TCP_Server_Info *server);
50extern void smb2_echo_request(struct work_struct *work); 54extern void smb2_echo_request(struct work_struct *work);
51extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); 55extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
52extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); 56extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2a5fdf26f79f..8dd73e61d762 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,7 +39,7 @@
39#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h" 40#include "smb2glob.h"
41 41
42static int 42int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{ 44{
45 int i, rc; 45 int i, rc;
@@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
116 return rc; 116 return rc;
117} 117}
118 118
119int
120smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
121{
122 cFYI(1, "smb3 signatures not supported yet");
123 return -EOPNOTSUPP;
124}
125
119/* must be called with server->srv_mutex held */ 126/* must be called with server->srv_mutex held */
120static int 127static int
121smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) 128smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
@@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
132 return rc; 139 return rc;
133 } 140 }
134 141
135 rc = smb2_calc_signature(rqst, server); 142 rc = server->ops->calc_signature(rqst, server);
136 143
137 return rc; 144 return rc;
138} 145}
@@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
168 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); 175 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
169 176
170 mutex_lock(&server->srv_mutex); 177 mutex_lock(&server->srv_mutex);
171 rc = smb2_calc_signature(rqst, server); 178 rc = server->ops->calc_signature(rqst, server);
172 mutex_unlock(&server->srv_mutex); 179 mutex_unlock(&server->srv_mutex);
173 180
174 if (rc) 181 if (rc)
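
Routing signing through server->ops->calc_signature lets each dialect plug in its own routine: smb2_calc_signature for 2.x and, for now, an smb3_calc_signature stub returning -EOPNOTSUPP. The dispatch shape, reduced to a self-contained sketch (the structs and names are mine):

#include <stdio.h>

struct rqst { const char *data; };

struct version_ops {
	const char *name;
	int (*calc_signature)(struct rqst *r);
};

static int smb2_calc_signature(struct rqst *r)
{
	printf("smb2-style signing of %s\n", r->data);
	return 0;
}

static int smb3_calc_signature(struct rqst *r)
{
	(void)r;
	puts("smb3 signatures not supported yet");
	return -95;			/* -EOPNOTSUPP on Linux */
}

static const struct version_ops smb21_ops = { "2.1", smb2_calc_signature };
static const struct version_ops smb30_ops = { "3.0", smb3_calc_signature };

static int sign_rqst(const struct version_ops *ops, struct rqst *r)
{
	return ops->calc_signature(r);	/* dialect-specific hook */
}

int main(void)
{
	struct rqst r = { "payload" };

	sign_rqst(&smb21_ops, &r);
	sign_rqst(&smb30_ops, &r);
	return 0;
}
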
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4c6285fff598..e2f57a007029 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV)
844COMPATIBLE_IOCTL(TIOCCBRK) 844COMPATIBLE_IOCTL(TIOCCBRK)
845COMPATIBLE_IOCTL(TIOCGSID) 845COMPATIBLE_IOCTL(TIOCGSID)
846COMPATIBLE_IOCTL(TIOCGICOUNT) 846COMPATIBLE_IOCTL(TIOCGICOUNT)
847COMPATIBLE_IOCTL(TIOCGPKT)
848COMPATIBLE_IOCTL(TIOCGPTLCK)
849COMPATIBLE_IOCTL(TIOCGEXCL)
847/* Little t */ 850/* Little t */
848COMPATIBLE_IOCTL(TIOCGETD) 851COMPATIBLE_IOCTL(TIOCGETD)
849COMPATIBLE_IOCTL(TIOCSETD) 852COMPATIBLE_IOCTL(TIOCSETD)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7414ae24a79b..712b10f64c70 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1613,12 +1613,12 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1613 return 0; 1613 return 0;
1614} 1614}
1615 1615
1616static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) 1616static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
1617{ 1617{
1618 struct dentry * dentry = file->f_path.dentry; 1618 struct dentry * dentry = file->f_path.dentry;
1619 1619
1620 mutex_lock(&dentry->d_inode->i_mutex); 1620 mutex_lock(&dentry->d_inode->i_mutex);
1621 switch (origin) { 1621 switch (whence) {
1622 case 1: 1622 case 1:
1623 offset += file->f_pos; 1623 offset += file->f_pos;
1624 case 0: 1624 case 0:
diff --git a/fs/coredump.c b/fs/coredump.c
index ce47379bfa61..177493272a61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
458 return err; 458 return err;
459} 459}
460 460
461void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) 461void do_coredump(siginfo_t *siginfo)
462{ 462{
463 struct core_state core_state; 463 struct core_state core_state;
464 struct core_name cn; 464 struct core_name cn;
@@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
474 static atomic_t core_dump_count = ATOMIC_INIT(0); 474 static atomic_t core_dump_count = ATOMIC_INIT(0);
475 struct coredump_params cprm = { 475 struct coredump_params cprm = {
476 .siginfo = siginfo, 476 .siginfo = siginfo,
477 .regs = regs, 477 .regs = signal_pt_regs(),
478 .limit = rlimit(RLIMIT_CORE), 478 .limit = rlimit(RLIMIT_CORE),
479 /* 479 /*
480 * We must use the same mm->flags while dumping core to avoid 480 * We must use the same mm->flags while dumping core to avoid
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92cdf24..153bb1e42e63 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
59 case S_IFDIR: 59 case S_IFDIR:
60 inode->i_op = &simple_dir_inode_operations; 60 inode->i_op = &simple_dir_inode_operations;
61 inode->i_fop = &simple_dir_operations; 61 inode->i_fop = &simple_dir_operations;
62 inode->i_private = NULL;
63 62
64 /* directory inodes start off with i_nlink == 2 63 /* directory inodes start off with i_nlink == 2
65 * (for "." entry) */ 64 * (for "." entry) */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14afbabe6546..472e6befc54d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -545,37 +545,38 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
545 mutex_unlock(&allocated_ptys_lock); 545 mutex_unlock(&allocated_ptys_lock);
546} 546}
547 547
548int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 548/**
549 * devpts_pty_new -- create a new inode in /dev/pts/
550 * @ptmx_inode: inode of the master
551 * @device: major+minor of the node to be created
552 * @index: used as a name of the node
553 * @priv: what's given back by devpts_get_priv
554 *
555 * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
556 */
557struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
558 void *priv)
549{ 559{
550 /* tty layer puts index from devpts_new_index() in here */
551 int number = tty->index;
552 struct tty_driver *driver = tty->driver;
553 dev_t device = MKDEV(driver->major, driver->minor_start+number);
554 struct dentry *dentry; 560 struct dentry *dentry;
555 struct super_block *sb = pts_sb_from_inode(ptmx_inode); 561 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
556 struct inode *inode = new_inode(sb); 562 struct inode *inode;
557 struct dentry *root = sb->s_root; 563 struct dentry *root = sb->s_root;
558 struct pts_fs_info *fsi = DEVPTS_SB(sb); 564 struct pts_fs_info *fsi = DEVPTS_SB(sb);
559 struct pts_mount_opts *opts = &fsi->mount_opts; 565 struct pts_mount_opts *opts = &fsi->mount_opts;
560 int ret = 0;
561 char s[12]; 566 char s[12];
562 567
563 /* We're supposed to be given the slave end of a pty */ 568 inode = new_inode(sb);
564 BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
565 BUG_ON(driver->subtype != PTY_TYPE_SLAVE);
566
567 if (!inode) 569 if (!inode)
568 return -ENOMEM; 570 return ERR_PTR(-ENOMEM);
569 571
570 inode->i_ino = number + 3; 572 inode->i_ino = index + 3;
571 inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); 573 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
572 inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); 574 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
573 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 575 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
574 init_special_inode(inode, S_IFCHR|opts->mode, device); 576 init_special_inode(inode, S_IFCHR|opts->mode, device);
575 inode->i_private = tty; 577 inode->i_private = priv;
576 tty->driver_data = inode;
577 578
578 sprintf(s, "%d", number); 579 sprintf(s, "%d", index);
579 580
580 mutex_lock(&root->d_inode->i_mutex); 581 mutex_lock(&root->d_inode->i_mutex);
581 582
@@ -585,18 +586,24 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
585 fsnotify_create(root->d_inode, dentry); 586 fsnotify_create(root->d_inode, dentry);
586 } else { 587 } else {
587 iput(inode); 588 iput(inode);
588 ret = -ENOMEM; 589 inode = ERR_PTR(-ENOMEM);
589 } 590 }
590 591
591 mutex_unlock(&root->d_inode->i_mutex); 592 mutex_unlock(&root->d_inode->i_mutex);
592 593
593 return ret; 594 return inode;
594} 595}
595 596
596struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 597/**
598 * devpts_get_priv -- get private data for a slave
599 * @pts_inode: inode of the slave
600 *
601 * Returns whatever was passed as priv in devpts_pty_new for a given inode.
602 */
603void *devpts_get_priv(struct inode *pts_inode)
597{ 604{
598 struct dentry *dentry; 605 struct dentry *dentry;
599 struct tty_struct *tty; 606 void *priv = NULL;
600 607
601 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 608 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
602 609
@@ -605,18 +612,22 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
605 if (!dentry) 612 if (!dentry)
606 return NULL; 613 return NULL;
607 614
608 tty = NULL;
609 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) 615 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
610 tty = (struct tty_struct *)pts_inode->i_private; 616 priv = pts_inode->i_private;
611 617
612 dput(dentry); 618 dput(dentry);
613 619
614 return tty; 620 return priv;
615} 621}
616 622
617void devpts_pty_kill(struct tty_struct *tty) 623/**
624 * devpts_pty_kill -- remove inode from /dev/pts/
625 * @inode: inode of the slave to be removed
626 *
627 * This is the inverse operation of devpts_pty_new.
628 */
629void devpts_pty_kill(struct inode *inode)
618{ 630{
619 struct inode *inode = tty->driver_data;
620 struct super_block *sb = pts_sb_from_inode(inode); 631 struct super_block *sb = pts_sb_from_inode(inode);
621 struct dentry *root = sb->s_root; 632 struct dentry *root = sb->s_root;
622 struct dentry *dentry; 633 struct dentry *dentry;
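
The devpts interface now takes dev_t/index/priv from the caller and hands back the created inode (or an ERR_PTR), with devpts_get_priv returning the opaque pointer later; the tty layer, not devpts, owns the tty specifics. A userspace model of that constructor/getter/destructor shape (all names illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int index; void *priv; };

static struct node *node_new(int index, void *priv)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return NULL;		/* stands in for ERR_PTR(-ENOMEM) */
	n->index = index;
	n->priv = priv;			/* opaque caller data */
	return n;
}

static void *node_get_priv(struct node *n) { return n->priv; }
static void node_kill(struct node *n)      { free(n); }

int main(void)
{
	int tty_data = 42;
	struct node *slave = node_new(3, &tty_data);

	if (!slave)
		return ENOMEM;
	printf("index %d priv %d\n", slave->index,
	       *(int *)node_get_priv(slave));
	node_kill(slave);		/* inverse of node_new */
	return 0;
}
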
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720dba0e..cf5b44b10c67 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ 540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
541 unsigned long fs_count; /* Number of filesystem-sized blocks */ 541 unsigned long fs_count; /* Number of filesystem-sized blocks */
542 int create; 542 int create;
543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
543 544
544 /* 545 /*
545 * If there was a memory error and we've overwritten all the 546 * If there was a memory error and we've overwritten all the
@@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
554 fs_count = fs_endblk - fs_startblk + 1; 555 fs_count = fs_endblk - fs_startblk + 1;
555 556
556 map_bh->b_state = 0; 557 map_bh->b_state = 0;
557 map_bh->b_size = fs_count << dio->inode->i_blkbits; 558 map_bh->b_size = fs_count << i_blkbits;
558 559
559 /* 560 /*
560 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we 561 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
@@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1053 int seg; 1054 int seg;
1054 size_t size; 1055 size_t size;
1055 unsigned long addr; 1056 unsigned long addr;
1056 unsigned blkbits = inode->i_blkbits; 1057 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
1058 unsigned blkbits = i_blkbits;
1057 unsigned blocksize_mask = (1 << blkbits) - 1; 1059 unsigned blocksize_mask = (1 << blkbits) - 1;
1058 ssize_t retval = -EINVAL; 1060 ssize_t retval = -EINVAL;
1059 loff_t end = offset; 1061 loff_t end = offset;
@@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1149 dio->inode = inode; 1151 dio->inode = inode;
1150 dio->rw = rw; 1152 dio->rw = rw;
1151 sdio.blkbits = blkbits; 1153 sdio.blkbits = blkbits;
1152 sdio.blkfactor = inode->i_blkbits - blkbits; 1154 sdio.blkfactor = i_blkbits - blkbits;
1153 sdio.block_in_file = offset >> blkbits; 1155 sdio.block_in_file = offset >> blkbits;
1154 1156
1155 sdio.get_block = get_block; 1157 sdio.get_block = get_block;
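
Here inode->i_blkbits is read once with ACCESS_ONCE into a local, and blkfactor and the mapping size are derived from that single snapshot, so a concurrent change to the inode cannot leave the two values inconsistent mid-setup. The same discipline in a userspace sketch (illustrative names):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned inode_blkbits = 12;	/* 4K fs blocks */

static void setup_dio(unsigned dev_blkbits)
{
	unsigned i_blkbits = atomic_load(&inode_blkbits);  /* one read */
	unsigned blkfactor = i_blkbits - dev_blkbits;

	/* both values come from the same snapshot */
	printf("blkbits=%u blkfactor=%u\n", dev_blkbits, blkfactor);
}

int main(void)
{
	setup_dio(9);			/* 512-byte device blocks */
	return 0;
}
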
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 1897eb1b4b6a..e4242c3f8486 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,6 +1,6 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on EXPERIMENTAL && INET 3 depends on INET
4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select IP_SCTP 5 select IP_SCTP
6 help 6 help
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 871c1abf6029..77c0f70f8fe8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -337,6 +337,7 @@ enum rsb_flags {
337 RSB_NEW_MASTER2, 337 RSB_NEW_MASTER2,
338 RSB_RECOVER_CONVERT, 338 RSB_RECOVER_CONVERT,
339 RSB_RECOVER_GRANT, 339 RSB_RECOVER_GRANT,
340 RSB_RECOVER_LVB_INVAL,
340}; 341};
341 342
342static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) 343static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b56950758188..a579f30f237d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
5393 if ((lkb->lkb_nodeid == nodeid_gone) || 5393 if ((lkb->lkb_nodeid == nodeid_gone) ||
5394 dlm_is_removed(ls, lkb->lkb_nodeid)) { 5394 dlm_is_removed(ls, lkb->lkb_nodeid)) {
5395 5395
5396 /* tell recover_lvb to invalidate the lvb
5397 because a node holding EX/PW failed */
5398 if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
5399 (lkb->lkb_grmode >= DLM_LOCK_PW)) {
5400 rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
5401 }
5402
5396 del_lkb(r, lkb); 5403 del_lkb(r, lkb);
5397 5404
5398 /* this put should free the lkb */ 5405 /* this put should free the lkb */
@@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6025 return error; 6032 return error;
6026} 6033}
6027 6034
6028/* The force flag allows the unlock to go ahead even if the lkb isn't granted. 6035/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
6029 Regardless of what rsb queue the lock is on, it's removed and freed. */ 6036 granted. Regardless of what rsb queue the lock is on, it's removed and
6037 freed. The IVVALBLK flag causes the lvb on the resource to be invalidated
6038 if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
6030 6039
6031static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) 6040static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6032{ 6041{
6033 struct dlm_args args; 6042 struct dlm_args args;
6034 int error; 6043 int error;
6035 6044
6036 set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); 6045 set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
6046 lkb->lkb_ua, &args);
6037 6047
6038 error = unlock_lock(ls, lkb, &args); 6048 error = unlock_lock(ls, lkb, &args);
6039 if (error == -DLM_EUNLOCK) 6049 if (error == -DLM_EUNLOCK)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 331ea4f94efd..dd87a31bcc21 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1385 struct connection *con; 1385 struct connection *con;
1386 struct writequeue_entry *e; 1386 struct writequeue_entry *e;
1387 int offset = 0; 1387 int offset = 0;
1388 int users = 0;
1389 1388
1390 con = nodeid2con(nodeid, allocation); 1389 con = nodeid2con(nodeid, allocation);
1391 if (!con) 1390 if (!con)
@@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1399 } else { 1398 } else {
1400 offset = e->end; 1399 offset = e->end;
1401 e->end += len; 1400 e->end += len;
1402 users = e->users++; 1401 e->users++;
1403 } 1402 }
1404 spin_unlock(&con->writequeue_lock); 1403 spin_unlock(&con->writequeue_lock);
1405 1404
@@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1414 spin_lock(&con->writequeue_lock); 1413 spin_lock(&con->writequeue_lock);
1415 offset = e->end; 1414 offset = e->end;
1416 e->end += len; 1415 e->end += len;
1417 users = e->users++; 1416 e->users++;
1418 list_add_tail(&e->list, &con->writequeue); 1417 list_add_tail(&e->list, &con->writequeue);
1419 spin_unlock(&con->writequeue_lock); 1418 spin_unlock(&con->writequeue_lock);
1420 goto got_one; 1419 goto got_one;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 4a7a76e42fc3..aedea28a86a1 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r)
717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents 717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
718 * based on the lvb's of the locks held on the rsb. 718 * based on the lvb's of the locks held on the rsb.
719 * 719 *
720 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it 720 * RSB_VALNOTVALID is set in two cases:
721 * was already set prior to recovery, it's not cleared, regardless of locks. 721 *
722 * 1. we are master, but not new, and we purged an EX/PW lock held by a
723 * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
724 *
725 * 2. we are a new master, and there are only NL/CR locks left.
726 * (We could probably improve this by only invalidating in this way when
727 * the previous master left uncleanly. VMS docs mention that.)
722 * 728 *
723 * The LVB contents are only considered for changing when this is a new master 729 * The LVB contents are only considered for changing when this is a new master
724 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with 730 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
@@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r)
734 int big_lock_exists = 0; 740 int big_lock_exists = 0;
735 int lvblen = r->res_ls->ls_lvblen; 741 int lvblen = r->res_ls->ls_lvblen;
736 742
743 if (!rsb_flag(r, RSB_NEW_MASTER2) &&
744 rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
745 /* case 1 above */
746 rsb_set_flag(r, RSB_VALNOTVALID);
747 return;
748 }
749
750 if (!rsb_flag(r, RSB_NEW_MASTER2))
751 return;
752
753 /* we are the new master, so figure out if VALNOTVALID should
754 be set, and set the rsb lvb from the best lkb available. */
755
737 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { 756 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
738 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) 757 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
739 continue; 758 continue;
@@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r)
772 if (!lock_lvb_exists) 791 if (!lock_lvb_exists)
773 goto out; 792 goto out;
774 793
794 /* lvb is invalidated if only NL/CR locks remain */
775 if (!big_lock_exists) 795 if (!big_lock_exists)
776 rsb_set_flag(r, RSB_VALNOTVALID); 796 rsb_set_flag(r, RSB_VALNOTVALID);
777 797
778 /* don't mess with the lvb unless we're the new master */
779 if (!rsb_flag(r, RSB_NEW_MASTER2))
780 goto out;
781
782 if (!r->res_lvbptr) { 798 if (!r->res_lvbptr) {
783 r->res_lvbptr = dlm_allocate_lvb(r->res_ls); 799 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
784 if (!r->res_lvbptr) 800 if (!r->res_lvbptr)
@@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
852 if (is_master(r)) { 868 if (is_master(r)) {
853 if (rsb_flag(r, RSB_RECOVER_CONVERT)) 869 if (rsb_flag(r, RSB_RECOVER_CONVERT))
854 recover_conversion(r); 870 recover_conversion(r);
871
872 /* recover lvb before granting locks so the updated
873 lvb/VALNOTVALID is presented in the completion */
874 recover_lvb(r);
875
855 if (rsb_flag(r, RSB_NEW_MASTER2)) 876 if (rsb_flag(r, RSB_NEW_MASTER2))
856 recover_grant(r); 877 recover_grant(r);
857 recover_lvb(r);
858 count++; 878 count++;
879 } else {
880 rsb_clear_flag(r, RSB_VALNOTVALID);
859 } 881 }
860 rsb_clear_flag(r, RSB_RECOVER_CONVERT); 882 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
883 rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
861 rsb_clear_flag(r, RSB_NEW_MASTER2); 884 rsb_clear_flag(r, RSB_NEW_MASTER2);
862 unlock_rsb(r); 885 unlock_rsb(r);
863 } 886 }
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d81b9f654086..35470d9b96e6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -19,6 +19,8 @@
19#include <linux/export.h> 19#include <linux/export.h>
20#include <linux/kref.h> 20#include <linux/kref.h>
21#include <linux/eventfd.h> 21#include <linux/eventfd.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
22 24
23struct eventfd_ctx { 25struct eventfd_ctx {
24 struct kref kref; 26 struct kref kref;
@@ -284,7 +286,25 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
284 return res; 286 return res;
285} 287}
286 288
289#ifdef CONFIG_PROC_FS
290static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
291{
292 struct eventfd_ctx *ctx = f->private_data;
293 int ret;
294
295 spin_lock_irq(&ctx->wqh.lock);
296 ret = seq_printf(m, "eventfd-count: %16llx\n",
297 (unsigned long long)ctx->count);
298 spin_unlock_irq(&ctx->wqh.lock);
299
300 return ret;
301}
302#endif
303
287static const struct file_operations eventfd_fops = { 304static const struct file_operations eventfd_fops = {
305#ifdef CONFIG_PROC_FS
306 .show_fdinfo = eventfd_show_fdinfo,
307#endif
288 .release = eventfd_release, 308 .release = eventfd_release,
289 .poll = eventfd_poll, 309 .poll = eventfd_poll,
290 .read = eventfd_read, 310 .read = eventfd_read,
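
With the .show_fdinfo hook wired up, the eventfd counter becomes visible through procfs. On a kernel carrying this patch, something like the following should print an "eventfd-count:" line alongside the usual pos/flags fields (an untested sketch, assuming the patched fdinfo output):

#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	char path[64], line[128];
	int efd = eventfd(7, 0);	/* counter starts at 7 */
	FILE *f;

	if (efd < 0)
		return 1;
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", efd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* includes "eventfd-count: 7" */
	fclose(f);
	close(efd);
	return 0;
}
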
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..be56b21435f8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,8 @@
 #include <asm/io.h>
 #include <asm/mman.h>
 #include <linux/atomic.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 
 /*
  * LOCKING:
@@ -346,7 +348,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-	return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
+	return op != EPOLL_CTL_DEL;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,34 +678,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	return 0;
 }
 
-/*
- * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
- * had no event flags set, indicating that another thread may be currently
- * handling that item's events (in the case that EPOLLONESHOT was being
- * used). Otherwise a zero result indicates that the item has been disabled
- * from receiving events. A disabled item may be re-enabled via
- * EPOLL_CTL_MOD. Must be called with "mtx" held.
- */
-static int ep_disable(struct eventpoll *ep, struct epitem *epi)
-{
-	int result = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ep->lock, flags);
-	if (epi->event.events & ~EP_PRIVATE_BITS) {
-		if (ep_is_linked(&epi->rdllink))
-			list_del_init(&epi->rdllink);
-		/* Ensure ep_poll_callback will not add epi back onto ready
-		   list: */
-		epi->event.events &= EP_PRIVATE_BITS;
-	}
-	else
-		result = -EBUSY;
-	spin_unlock_irqrestore(&ep->lock, flags);
-
-	return result;
-}
-
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
@@ -811,8 +785,34 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	return pollflags != -1 ? pollflags : 0;
 }
 
+#ifdef CONFIG_PROC_FS
+static int ep_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct eventpoll *ep = f->private_data;
+	struct rb_node *rbp;
+	int ret = 0;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
+
+		ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
+				 epi->ffd.fd, epi->event.events,
+				 (long long)epi->event.data);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&ep->mtx);
+
+	return ret;
+}
+#endif
+
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= ep_show_fdinfo,
+#endif
 	.release	= ep_eventpoll_release,
 	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
@@ -1048,6 +1048,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1815,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		} else
 			error = -ENOENT;
 		break;
-	case EPOLL_CTL_DISABLE:
-		if (epi)
-			error = ep_disable(ep, epi);
-		else
-			error = -ENOENT;
-		break;
 	}
 	mutex_unlock(&ep->mtx);
 
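
Note: with the matching ep_show_fdinfo hook above, every watched descriptor appears as a "tfd:" line in the epoll fd's fdinfo. A minimal userspace sketch of reading it back (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <sys/epoll.h>
    #include <sys/eventfd.h>

    int main(void)
    {
    	char path[64], line[256];
    	FILE *f;
    	int epfd = epoll_create1(0);
    	int tfd = eventfd(0, 0);
    	struct epoll_event ev = { .events = EPOLLIN, .data.fd = tfd };

    	if (epfd < 0 || tfd < 0 ||
    	    epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev))
    		return 1;
    	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", epfd);
    	f = fopen(path, "r");
    	while (f && fgets(line, sizeof(line), f))
    		fputs(line, stdout);	/* "tfd: ... events: ... data: ..." */
    	return 0;
    }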
diff --git a/fs/exec.c b/fs/exec.c
index 0039055b1fc6..d8e1191cb112 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1266,14 +1266,13 @@ int prepare_binprm(struct linux_binprm *bprm)
 	bprm->cred->egid = current_egid();
 
 	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
-	    !current->no_new_privs) {
+	    !current->no_new_privs &&
+	    kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
+	    kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
 		/* Set-uid? */
 		if (mode & S_ISUID) {
-			if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
-				return -EPERM;
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
 			bprm->cred->euid = inode->i_uid;
-
 		}
 
 		/* Set-gid? */
@@ -1283,8 +1282,6 @@ int prepare_binprm(struct linux_binprm *bprm)
 		 * executable.
 		 */
 		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
-			if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
-				return -EPERM;
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
 			bprm->cred->egid = inode->i_gid;
 		}
@@ -1349,13 +1346,17 @@ EXPORT_SYMBOL(remove_arg_zero);
 /*
  * cycle the list of binary formats handler, until one recognizes the image
  */
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
+int search_binary_handler(struct linux_binprm *bprm)
 {
 	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
 	struct linux_binfmt *fmt;
 	pid_t old_pid, old_vpid;
 
+	/* This allows 4 levels of binfmt rewrites before failing hard. */
+	if (depth > 5)
+		return -ELOOP;
+
 	retval = security_bprm_check(bprm);
 	if (retval)
 		return retval;
@@ -1374,18 +1375,14 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
 		list_for_each_entry(fmt, &formats, lh) {
-			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
+			int (*fn)(struct linux_binprm *) = fmt->load_binary;
 			if (!fn)
 				continue;
 			if (!try_module_get(fmt->module))
 				continue;
 			read_unlock(&binfmt_lock);
-			retval = fn(bprm, regs);
-			/*
-			 * Restore the depth counter to its starting value
-			 * in this call, so we don't have to rely on every
-			 * load_binary function to restore it on return.
-			 */
+			bprm->recursion_depth = depth + 1;
+			retval = fn(bprm);
 			bprm->recursion_depth = depth;
 			if (retval >= 0) {
 				if (depth == 0) {
@@ -1439,8 +1436,7 @@ EXPORT_SYMBOL(search_binary_handler);
  */
 static int do_execve_common(const char *filename,
 				struct user_arg_ptr argv,
-				struct user_arg_ptr envp,
-				struct pt_regs *regs)
+				struct user_arg_ptr envp)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1524,7 +1520,7 @@ static int do_execve_common(const char *filename,
 	if (retval < 0)
 		goto out;
 
-	retval = search_binary_handler(bprm,regs);
+	retval = search_binary_handler(bprm);
 	if (retval < 0)
 		goto out;
 
@@ -1566,19 +1562,17 @@ out_ret:
 
 int do_execve(const char *filename,
 	const char __user *const __user *__argv,
-	const char __user *const __user *__envp,
-	struct pt_regs *regs)
+	const char __user *const __user *__envp)
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp);
 }
 
 #ifdef CONFIG_COMPAT
-int compat_do_execve(const char *filename,
+static int compat_do_execve(const char *filename,
 	const compat_uptr_t __user *__argv,
-	const compat_uptr_t __user *__envp,
-	struct pt_regs *regs)
+	const compat_uptr_t __user *__envp)
 {
 	struct user_arg_ptr argv = {
 		.is_compat = true,
@@ -1588,7 +1582,7 @@ int compat_do_execve(const char *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp);
 }
 #endif
 
@@ -1669,7 +1663,7 @@ SYSCALL_DEFINE3(execve,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = do_execve(path->name, argv, envp, current_pt_regs());
+		error = do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1682,8 +1676,7 @@ asmlinkage long compat_sys_execve(const char __user * filename,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = compat_do_execve(path->name, argv, envp,
-					 current_pt_regs());
+		error = compat_do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1696,12 +1689,9 @@ int kernel_execve(const char *filename,
 		  const char *const argv[],
 		  const char *const envp[])
 {
-	struct pt_regs *p = current_pt_regs();
-	int ret;
-
-	ret = do_execve(filename,
+	int ret = do_execve(filename,
 			(const char __user *const __user *)argv,
-			(const char __user *const __user *)envp, p);
+			(const char __user *const __user *)envp);
 	if (ret < 0)
 		return ret;
 
@@ -1709,6 +1699,6 @@ int kernel_execve(const char *filename,
 	 * We were successful.  We won't be returning to our caller, but
 	 * instead to user space by manipulating the kernel stack.
 	 */
-	ret_from_kernel_execve(p);
+	ret_from_kernel_execve(current_pt_regs());
 }
 #endif
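
Note: the prepare_binprm() hunk above changes unmapped set-id owners from a hard -EPERM into "quietly ignore the bit", matching the behaviour already used for no_new_privs. A userspace sketch of that existing no_new_privs behaviour (illustrative only; the path to a setuid binary is an assumption):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <unistd.h>

    int main(void)
    {
    	/* One-way switch: no exec from here on may grant privileges. */
    	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
    		perror("prctl");
    	/* Assumed setuid-root binary; it runs with the caller's euid. */
    	execl("/usr/bin/passwd", "passwd", (char *)NULL);
    	perror("execl");
    	return 1;
    }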
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index b56181047751..d1f80abd8828 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -361,12 +361,12 @@ static int read_exec(struct page_collect *pcol)
 	return 0;
 
 err:
-	if (!pcol->read_4_write)
-		_unlock_pcol_pages(pcol, ret, READ);
-
-	pcol_free(pcol);
-
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, READ);
+	pcol_free(pcol_copy);
 	kfree(pcol_copy);
+
 	return ret;
 }
 
@@ -676,8 +676,10 @@ static int write_exec(struct page_collect *pcol)
 	return 0;
 
 err:
-	_unlock_pcol_pages(pcol, ret, WRITE);
-	pcol_free(pcol);
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, WRITE);
+	pcol_free(pcol_copy);
 	kfree(pcol_copy);
 
 	return ret;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 29ab099e3e08..606bb074c501 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -341,10 +341,21 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
 	return type;
 }
 
+int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
+			     int *max_len, struct inode *parent)
+{
+	const struct export_operations *nop = inode->i_sb->s_export_op;
+
+	if (nop && nop->encode_fh)
+		return nop->encode_fh(inode, fid->raw, max_len, parent);
+
+	return export_encode_fh(inode, fid, max_len, parent);
+}
+EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
+
 int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
 		int connectable)
 {
-	const struct export_operations *nop = dentry->d_sb->s_export_op;
 	int error;
 	struct dentry *p = NULL;
 	struct inode *inode = dentry->d_inode, *parent = NULL;
@@ -357,10 +368,8 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
 	 */
 		parent = p->d_inode;
 	}
-	if (nop->encode_fh)
-		error = nop->encode_fh(inode, fid->raw, max_len, parent);
-	else
-		error = export_encode_fh(inode, fid, max_len, parent);
+
+	error = exportfs_encode_inode_fh(inode, fid, max_len, parent);
 	dput(p);
 
 	return error;
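
Note: exportfs_encode_inode_fh() gives callers that hold only an inode (no dentry) a way to build a file handle; exportfs_encode_fh() is now a thin wrapper around it. A kernel-side sketch of a caller (hypothetical function, shown only to illustrate the calling convention):

    static int example_encode(struct inode *inode, struct fid *fid)
    {
    	int dwords = MAX_HANDLE_SZ / 4;	/* buffer size in 32-bit words */
    	int type;

    	/* NULL parent: no connectable handle is requested here. */
    	type = exportfs_encode_inode_fh(inode, fid, &dwords, NULL);
    	/* "type" is the fileid type the decode side must interpret;
    	   dwords now holds the number of words actually used. */
    	return type;
    }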
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7320a66e958f..22548f56197b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 
-	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
-	    unlikely(start >= max_blks))
+	if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+	    start >= max_blks ||
+	    range->len < sb->s_blocksize)
 		return -EINVAL;
 	if (end >= max_blks)
 		end = max_blks - 1;
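
Note: the added check rejects FITRIM requests shorter than one filesystem block, which could never cover a whole block anyway. Seen from userspace (an illustrative sketch; the mount point is an assumption, and ext3's block size is always at least 1024 bytes, so a 512-byte request now fails):

    #include <fcntl.h>
    #include <linux/fs.h>		/* FITRIM, struct fstrim_range */
    #include <stdio.h>
    #include <sys/ioctl.h>

    int main(void)
    {
    	struct fstrim_range r = { .start = 0, .len = 512, .minlen = 0 };
    	int fd = open("/mnt/ext3", O_RDONLY);	/* any file on the fs */

    	if (fd < 0 || ioctl(fd, FITRIM, &r))
    		perror("FITRIM");	/* now EINVAL: len < block size */
    	return 0;
    }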
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index c8fff930790d..dd91264ba94f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -296,17 +296,17 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
  * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
  * will be invalid once the directory was converted into a dx directory
  */
-loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin)
+loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 	int dx_dir = is_dx_dir(inode);
 	loff_t htree_max = ext3_get_htree_eof(file);
 
 	if (likely(dx_dir))
-		return generic_file_llseek_size(file, offset, origin,
+		return generic_file_llseek_size(file, offset, whence,
 						htree_max, htree_max);
 	else
-		return generic_file_llseek(file, offset, origin);
+		return generic_file_llseek(file, offset, whence);
 }
 
 /*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7e87e37a372a..b176d4253544 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1071,8 +1071,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
-		if (err > 1)
-			WARN_ON(1);
+		WARN_ON(err > 1);
 		err = 0;
 	}
 	*errp = err;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5366393528df..6e50223b3299 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1661,9 +1661,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 	}
 	sb->s_fs_info = sbi;
-	sbi->s_mount_opt = 0;
-	sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID);
-	sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID);
 	sbi->s_sb_block = sb_block;
 
 	blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c22f17021b6e..0a475c881852 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23
 	  compiled kernel size by using one file system driver for
 	  ext2, ext3, and ext4 file systems.
 
-config EXT4_FS_XATTR
-	bool "Ext4 extended attributes"
-	depends on EXT4_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext4.
-
 config EXT4_FS_POSIX_ACL
 	bool "Ext4 POSIX Access Control Lists"
-	depends on EXT4_FS_XATTR
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL
 
 config EXT4_FS_SECURITY
 	bool "Ext4 Security Labels"
-	depends on EXT4_FS_XATTR
 	help
 	  Security labels support alternative access control models
 	  implemented by security modules like SELinux.  This option
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 56fd8f865930..0310fec2ee3d 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-		mmp.o indirect.o
+		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+		xattr_trusted.o inline.o
 
-ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d3c5b88fd89f..e6e0d988439b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 
 retry:
 	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+		goto release_and_out;
+	}
 	error = ext4_set_acl(handle, inode, type, acl);
 	ext4_journal_stop(handle);
 	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 8e07d2a5a139..80a28b297279 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -27,23 +27,11 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include "ext4.h"
-
-static unsigned char ext4_filetype_table[] = {
-	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
+#include "xattr.h"
 
 static int ext4_dx_readdir(struct file *filp,
 			   void *dirent, filldir_t filldir);
 
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
-	    (filetype >= EXT4_FT_MAX))
-		return DT_UNKNOWN;
-
-	return (ext4_filetype_table[filetype]);
-}
-
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
  * (or a directory which chould potentially get coverted to use htree
@@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode)
  * Return 0 if the directory entry is OK, and 1 if there is a problem
  *
  * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
  */
 int __ext4_check_dir_entry(const char *function, unsigned int line,
 			   struct inode *dir, struct file *filp,
 			   struct ext4_dir_entry_2 *de,
-			   struct buffer_head *bh,
+			   struct buffer_head *bh, char *buf, int size,
 			   unsigned int offset)
 {
 	const char *error_msg = NULL;
@@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		error_msg = "rec_len % 4 != 0";
 	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (unlikely(((char *) de - bh->b_data) + rlen >
-			dir->i_sb->s_blocksize))
-		error_msg = "directory entry across blocks";
+	else if (unlikely(((char *) de - buf) + rlen > size))
+		error_msg = "directory entry across range";
 	else if (unlikely(le32_to_cpu(de->inode) >
 			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
 		error_msg = "inode out of bounds";
@@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset % bh->b_size),
+				error_msg, (unsigned) (offset % size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
 		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset % bh->b_size),
+				error_msg, (unsigned) (offset % size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp,
 	int ret = 0;
 	int dir_has_error = 0;
 
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		ret = ext4_read_inline_dir(filp, dirent, filldir,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -221,8 +219,9 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 bh, offset)) {
+			if (ext4_check_dir_entry(inode, filp, de, bh,
+						 bh->b_data, bh->b_size,
+						 offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
@@ -334,17 +333,17 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
  *
  * For non-htree, ext4_llseek already chooses the proper max offset.
  */
-loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 	int dx_dir = is_dx_dir(inode);
 	loff_t htree_max = ext4_get_htree_eof(file);
 
 	if (likely(dx_dir))
-		return generic_file_llseek_size(file, offset, origin,
+		return generic_file_llseek_size(file, offset, whence,
 						htree_max, htree_max);
 	else
-		return ext4_llseek(file, offset, origin);
+		return ext4_llseek(file, offset, whence);
 }
 
 /*
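
Note: with the extra buf/size pair, __ext4_check_dir_entry() can validate entries in any buffer, not just a full directory block; inline-data directories pass the in-inode area instead. A sketch of the two call shapes (illustrative only; the inline_buf/inline_size/iloc_bh names are hypothetical):

    /* block-backed directory: the buffer is the whole disk block */
    if (ext4_check_dir_entry(dir, filp, de, bh,
    			 bh->b_data, bh->b_size, offset))
    	return -EIO;

    /* inline-data directory: the buffer is the in-inode area, so its
     * size is the inline data size rather than the block size */
    if (ext4_check_dir_entry(dir, filp, de, iloc_bh,
    			 inline_buf, inline_size, offset))
    	return -EIO;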
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c20de1d59d0..8462eb3c33aa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,6 +57,16 @@
 #define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
 	ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
 
@@ -392,6 +402,7 @@ struct flex_groups {
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL		0x10000000 /* Inode has inline data. */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
@@ -448,28 +459,26 @@ enum {
 	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
 	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
 	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
+	EXT4_INODE_INLINE_DATA	= 28,	/* Data in inode. */
 	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
 };
 
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
-#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
-	printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
-		EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
-
-/*
- * Since it's pretty easy to mix up bit numbers and hex values, and we
- * can't do a compile-time test for ENUM values, we use a run-time
- * test to make sure that EXT4_XXX_FL is consistent with respect to
- * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
- * out so it won't cost any extra space in the compiled kernel image.
- * But it's important that these values are the same, since we are
- * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
- * must be consistent with the values of FS_XXX_FL defined in
- * include/linux/fs.h and the on-disk values found in ext2, ext3, and
- * ext4 filesystems, and of course the values defined in e2fsprogs.
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
  *
  * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
  */
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG)	BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
 static inline void ext4_check_flag_values(void)
 {
 	CHECK_FLAG_VALUE(SECRM);
@@ -494,6 +503,7 @@ static inline void ext4_check_flag_values(void)
 	CHECK_FLAG_VALUE(EXTENTS);
 	CHECK_FLAG_VALUE(EA_INODE);
 	CHECK_FLAG_VALUE(EOFBLOCKS);
+	CHECK_FLAG_VALUE(INLINE_DATA);
 	CHECK_FLAG_VALUE(RESERVED);
 }
 
@@ -811,6 +821,8 @@ struct ext4_ext_cache {
 	__u32	ec_len; /* must be 32bit to return holes */
 };
 
+#include "extents_status.h"
+
 /*
  * fourth extended file system inode data in memory
 */
@@ -833,7 +845,6 @@ struct ext4_inode_info {
 #endif
 	unsigned long	i_flags;
 
-#ifdef CONFIG_EXT4_FS_XATTR
 	/*
	 * Extended attributes can be read independently of the main file
	 * data. Taking i_mutex even when reading would cause contention
@@ -842,7 +853,6 @@ struct ext4_inode_info {
	 * EAs.
	 */
 	struct rw_semaphore xattr_sem;
-#endif
 
 	struct list_head i_orphan;	/* unlinked but open inodes */
 
@@ -888,6 +898,10 @@ struct ext4_inode_info {
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
 
+	/* extents status tree */
+	struct ext4_es_tree i_es_tree;
+	rwlock_t i_es_lock;
+
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
 
@@ -902,6 +916,10 @@ struct ext4_inode_info {
 	/* on-disk additional length */
 	__u16 i_extra_isize;
 
+	/* Indicate the inline data space. */
+	u16 i_inline_off;
+	u16 i_inline_size;
+
 #ifdef CONFIG_QUOTA
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -1360,6 +1378,7 @@ enum {
 	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
					   nolocking */
+	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1481,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1505,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
 					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
-					 EXT4_FEATURE_INCOMPAT_MMP)
+					 EXT4_FEATURE_INCOMPAT_MMP | \
+					 EXT4_FEATURE_INCOMPAT_INLINE_DATA)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1592,6 +1612,11 @@ struct ext4_dir_entry_tail {
 	__le32	det_checksum;	/* crc32c(uuid+inum+dirblock) */
 };
 
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+	((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+					((blocksize) - \
+					 sizeof(struct ext4_dir_entry_tail))))
+
 /*
  * Ext4 directory file types.  Only the low 3 bits are used.  The
  * other bits are reserved for now.
@@ -1936,14 +1961,42 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
 				  struct file *,
 				  struct ext4_dir_entry_2 *,
-				  struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+				  struct buffer_head *, char *, int,
+				  unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
 	unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
-					(de), (bh), (offset)))
+					(de), (bh), (buf), (size), (offset)))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+			     struct buffer_head *bh,
+			     void *buf, int buf_size,
+			     const char *name, int namelen,
+			     struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+			struct ext4_dir_entry_2 *de,
+			int buf_size,
+			const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+				     EXT4_FEATURE_COMPAT_DIR_INDEX))
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+	    (filetype >= EXT4_FT_MAX))
+		return DT_UNKNOWN;
+
+	return ext4_filetype_table[filetype];
+}
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -1994,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 				ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
 				ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+			   struct buffer_head *head,
+			   unsigned from,
+			   unsigned to,
+			   int *partial,
+			   int (*fn)(handle_t *handle,
+				     struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+				struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA	 2
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int ext4_write_inode(struct inode *, struct writeback_control *);
@@ -2050,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 				__u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+		      char *search_buf,
+		      int buf_size,
+		      struct inode *dir,
+		      const struct qstr *d_name,
+		      unsigned int offset,
+		      struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+				     struct inode *dir,
+				     struct ext4_dir_entry_2 *de_del,
+				     struct buffer_head *bh,
+				     void *entry_buf,
+				     int buf_size,
+				     int csum_size);
 
 /* resize.c */
 extern int ext4_group_add(struct super_block *sb,
@@ -2376,6 +2458,15 @@ extern void ext4_unwritten_wait(struct inode *inode);
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
 extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+				 struct ext4_dir_entry_2 *de,
+				 int blocksize, int csum_size,
+				 unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+				   unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+					 struct inode *inode,
+					 struct buffer_head *bh);
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2393,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int,
 			       struct inode *, __le32 *, unsigned int);
 
 /* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2410,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 					  ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+					 ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+						   int num,
+						   struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+				      struct ext4_extent *ex1,
+				      struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+				  struct ext4_ext_path *,
+				  struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+						  struct ext4_ext_path *);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
+
+
 /* move_extent.c */
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
@@ -2445,17 +2558,13 @@ enum ext4_state_bits {
 			 * never, ever appear in a buffer_head's state
 			 * flag. See EXT4_MAP_FROM_CLUSTER to see where
			 * this is used. */
-	BH_Da_Mapped,	/* Delayed allocated block that now has a mapping. This
-			 * flag is set when ext4_map_blocks is called on a
-			 * delayed allocated block to get its real mapping. */
 };
 
 BUFFER_FNS(Uninit, uninit)
 TAS_BUFFER_FNS(Uninit, uninit)
-BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
- * Add new method to test wether block and inode bitmaps are properly
+ * Add new method to test whether block and inode bitmaps are properly
 * initialized. With uninit_bg reading the block from disk is not enough
 * to mark the bitmap uptodate. We need to also zero-out the bitmap
 */
@@ -2503,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb);
 
 #endif	/* __KERNEL__ */
 
-#include "ext4_extents.h"
-
 #endif	/* _EXT4_H */
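
Note: the CHECK_FLAG_VALUE() rework above trades the old run-time printk/BUG_ON for a compile-time assertion. The idiom, reduced to a standalone sketch (the names are hypothetical):

    #include <linux/bug.h>

    #define EXAMPLE_FL		0x00000008	/* hypothetical flag value */
    #define EXAMPLE_INODE_BIT	3		/* hypothetical bit number */

    /* Compiles to nothing when the values agree; otherwise the build
     * fails right here instead of oopsing at mount time. */
    static inline void example_check_flag_values(void)
    {
    	BUILD_BUG_ON(EXAMPLE_FL != (1 << EXAMPLE_INODE_BIT));
    }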
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index cb1b2c919963..487fda12bc00 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,16 +43,6 @@
 #define CHECK_BINSEARCH__
 
 /*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
-#else
-#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
-#endif
-
-/*
  * If EXT_STATS is defined then stats numbers are collected.
  * These number will be displayed at umount time.
 */
@@ -144,20 +134,6 @@ struct ext4_ext_path {
 */
 
 /*
- * to be called by ext4_ext_walk_space()
- * negative retcode - error
- * positive retcode - signal for ext4_ext_walk_space(), see below
- * callback must return valid extent (passed or newly created)
- */
-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
-					struct ext4_ext_cache *,
-					struct ext4_extent *, void *);
-
-#define EXT_CONTINUE   0
-#define EXT_BREAK      1
-#define EXT_REPEAT     2
-
-/*
  * Maximum number of logical blocks in a file; ext4_extent's ee_block is
  * __le32.
 */
@@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 						0xffff);
 }
 
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
-					 ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
-						   int num,
-						   struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
-				      struct ext4_extent *ex1,
-				      struct ext4_extent *ex2);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-						  struct ext4_ext_path *);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
-extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
-				      int search_hint_reverse);
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 56d258c18303..7177f9b21cb2 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle)
 		handle->h_sync = 1;
 }
 
-static inline void ext4_handle_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	if (ext4_handle_valid(handle))
-		jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline int ext4_handle_is_aborted(handle_t *handle)
 {
 	if (ext4_handle_valid(handle))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7011ac967208..26af22832a84 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -41,6 +41,8 @@
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
 
 #include <trace/events/ext4.h>
 
@@ -109,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle,
 			     int split_flag,
 			     int flags);
 
+static int ext4_find_delayed_extent(struct inode *inode,
+				    struct ext4_ext_cache *newex);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -1959,27 +1964,33 @@ cleanup:
 	return err;
 }
 
-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-			       ext4_lblk_t num, ext_prepare_callback func,
-			       void *cbdata)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+				    ext4_lblk_t block, ext4_lblk_t num,
+				    struct fiemap_extent_info *fieinfo)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_ext_cache cbex;
+	struct ext4_ext_cache newex;
 	struct ext4_extent *ex;
-	ext4_lblk_t next, start = 0, end = 0;
+	ext4_lblk_t next, next_del, start = 0, end = 0;
 	ext4_lblk_t last = block + num;
-	int depth, exists, err = 0;
-
-	BUG_ON(func == NULL);
-	BUG_ON(inode == NULL);
+	int exists, depth = 0, err = 0;
+	unsigned int flags = 0;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 
 	while (block < last && block != EXT_MAX_BLOCKS) {
 		num = last - block;
 		/* find extent for this block */
 		down_read(&EXT4_I(inode)->i_data_sem);
+
+		if (path && ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
 		path = ext4_ext_find_extent(inode, block, path);
-		up_read(&EXT4_I(inode)->i_data_sem);
 		if (IS_ERR(path)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
 			err = PTR_ERR(path);
 			path = NULL;
 			break;
@@ -1987,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 
 		depth = ext_depth(inode);
 		if (unlikely(path[depth].p_hdr == NULL)) {
+			up_read(&EXT4_I(inode)->i_data_sem);
 			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 			err = -EIO;
 			break;
 		}
 		ex = path[depth].p_ext;
 		next = ext4_ext_next_allocated_block(path);
+		ext4_ext_drop_refs(path);
 
+		flags = 0;
 		exists = 0;
 		if (!ex) {
 			/* there is no extent yet, so try to allocate
@@ -2030,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		BUG_ON(end <= start);
 
 		if (!exists) {
-			cbex.ec_block = start;
-			cbex.ec_len = end - start;
-			cbex.ec_start = 0;
+			newex.ec_block = start;
+			newex.ec_len = end - start;
+			newex.ec_start = 0;
 		} else {
-			cbex.ec_block = le32_to_cpu(ex->ee_block);
-			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext4_ext_pblock(ex);
+			newex.ec_block = le32_to_cpu(ex->ee_block);
+			newex.ec_len = ext4_ext_get_actual_len(ex);
+			newex.ec_start = ext4_ext_pblock(ex);
+			if (ext4_ext_is_uninitialized(ex))
+				flags |= FIEMAP_EXTENT_UNWRITTEN;
 		}
 
-		if (unlikely(cbex.ec_len == 0)) {
-			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
-			err = -EIO;
-			break;
+		/*
+		 * Find delayed extent and update newex accordingly. We call
+		 * it even in !exists case to find out whether newex is the
+		 * last existing extent or not.
+		 */
+		next_del = ext4_find_delayed_extent(inode, &newex);
+		if (!exists && next_del) {
+			exists = 1;
+			flags |= FIEMAP_EXTENT_DELALLOC;
 		}
-		err = func(inode, next, &cbex, ex, cbdata);
-		ext4_ext_drop_refs(path);
+		up_read(&EXT4_I(inode)->i_data_sem);
 
-		if (err < 0)
+		if (unlikely(newex.ec_len == 0)) {
+			EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+			err = -EIO;
 			break;
+		}
 
-		if (err == EXT_REPEAT)
-			continue;
-		else if (err == EXT_BREAK) {
-			err = 0;
-			break;
+		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
+		if (next == next_del) {
+			flags |= FIEMAP_EXTENT_LAST;
+			if (unlikely(next_del != EXT_MAX_BLOCKS ||
+				     next != EXT_MAX_BLOCKS)) {
+				EXT4_ERROR_INODE(inode,
+						 "next extent == %u, next "
+						 "delalloc extent = %u",
+						 next, next_del);
+				err = -EIO;
+				break;
+			}
 		}
 
-		if (ext_depth(inode) != depth) {
-			/* depth was changed. we have to realloc path */
-			kfree(path);
-			path = NULL;
+		if (exists) {
+			err = fiemap_fill_next_extent(fieinfo,
+				(__u64)newex.ec_block << blksize_bits,
+				(__u64)newex.ec_start << blksize_bits,
+				(__u64)newex.ec_len << blksize_bits,
+				flags);
+			if (err < 0)
+				break;
+			if (err == 1) {
+				err = 0;
+				break;
+			}
 		}
 
-		block = cbex.ec_block + cbex.ec_len;
+		block = newex.ec_block + newex.ec_len;
 	}
 
 	if (path) {
@@ -2156,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 			struct ext4_extent *ex)
 {
 	struct ext4_ext_cache *cex;
-	struct ext4_sb_info *sbi;
 	int ret = 0;
 
 	/*
@@ -2164,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
-	sbi = EXT4_SB(inode->i_sb);
 
 	/* has cache valid data? */
 	if (cex->ec_len == 0)
@@ -2273,7 +2309,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	int index;
-	int depth = ext_depth(inode);
+	int depth;
+
+	/* If we are converting the inline data, only one is needed here. */
+	if (ext4_has_inline_data(inode))
+		return 1;
+
+	depth = ext_depth(inode);
 
 	if (chunk)
 		index = depth * 2;
@@ -3461,115 +3503,34 @@ out:
 /**
  * ext4_find_delalloc_range: find delayed allocated block in the given range.
  *
- * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
- * whether there are any buffers marked for delayed allocation. It returns '1'
- * on the first delalloc'ed buffer head found. If no buffer head in the given
- * range is marked for delalloc, it returns 0.
- * lblk_start should always be <= lblk_end.
- * search_hint_reverse is to indicate that searching in reverse from lblk_end to
- * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
- * block sooner). This is useful when blocks are truncated sequentially from
- * lblk_start towards lblk_end.
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
 */
 static int ext4_find_delalloc_range(struct inode *inode,
 				    ext4_lblk_t lblk_start,
-				    ext4_lblk_t lblk_end,
-				    int search_hint_reverse)
+				    ext4_lblk_t lblk_end)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct buffer_head *head, *bh = NULL;
-	struct page *page;
-	ext4_lblk_t i, pg_lblk;
-	pgoff_t index;
-
-	if (!test_opt(inode->i_sb, DELALLOC))
-		return 0;
-
-	/* reverse search wont work if fs block size is less than page size */
-	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
-		search_hint_reverse = 0;
+	struct extent_status es;
 
-	if (search_hint_reverse)
-		i = lblk_end;
+	es.start = lblk_start;
+	ext4_es_find_extent(inode, &es);
+	if (es.len == 0)
+		return 0; /* there is no delay extent in this tree */
+	else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+		return 1;
+	else if (lblk_start <= es.start && es.start <= lblk_end)
+		return 1;
 	else
-		i = lblk_start;
+		return 0;
-
-	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	while ((i >= lblk_start) && (i <= lblk_end)) {
-		page = find_get_page(mapping, index);
-		if (!page)
-			goto nextpage;
-
-		if (!page_has_buffers(page))
-			goto nextpage;
-
-		head = page_buffers(page);
-		if (!head)
-			goto nextpage;
-
-		bh = head;
-		pg_lblk = index << (PAGE_CACHE_SHIFT -
-						inode->i_blkbits);
-		do {
-			if (unlikely(pg_lblk < lblk_start)) {
-				/*
-				 * This is possible when fs block size is less
-				 * than page size and our cluster starts/ends in
-				 * middle of the page. So we need to skip the
-				 * initial few blocks till we reach the 'lblk'
-				 */
-				pg_lblk++;
-				continue;
-			}
-
-			/* Check if the buffer is delayed allocated and that it
-			 * is not yet mapped. (when da-buffers are mapped during
-			 * their writeout, their da_mapped bit is set.)
-			 */
-			if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
-				page_cache_release(page);
-				trace_ext4_find_delalloc_range(inode,
-						lblk_start, lblk_end,
-						search_hint_reverse,
-						1, i);
-				return 1;
-			}
-			if (search_hint_reverse)
-				i--;
-			else
-				i++;
-		} while ((i >= lblk_start) && (i <= lblk_end) &&
-				((bh = bh->b_this_page) != head));
-nextpage:
-		if (page)
3546 page_cache_release(page);
3547 /*
3548 * Move to next page. 'i' will be the first lblk in the next
3549 * page.
3550 */
3551 if (search_hint_reverse)
3552 index--;
3553 else
3554 index++;
3555 i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3556 }
3557
3558 trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3559 search_hint_reverse, 0, 0);
3560 return 0;
3561} 3524}
3562 3525
3563int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, 3526int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3564 int search_hint_reverse)
3565{ 3527{
3566 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3528 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3567 ext4_lblk_t lblk_start, lblk_end; 3529 ext4_lblk_t lblk_start, lblk_end;
3568 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); 3530 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3569 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3531 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3570 3532
3571 return ext4_find_delalloc_range(inode, lblk_start, lblk_end, 3533 return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3572 search_hint_reverse);
3573} 3534}
3574 3535
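With the extent status tree in place, the delalloc check above reduces to a single lookup plus an interval-overlap test instead of a page-cache scan. A self-contained sketch of the overlap logic, assuming es holds the first delayed extent at or after lblk_start as ext4_es_find_extent() would return it (simplified types, not kernel code):

    #include <stdio.h>

    struct es { unsigned start, len; };

    static int range_has_delalloc(struct es es, unsigned lblk_start,
                                  unsigned lblk_end)
    {
        if (es.len == 0)
            return 0;   /* no delayed extent in the tree at all */
        if (es.start <= lblk_start && lblk_start < es.start + es.len)
            return 1;   /* extent covers the start of the range */
        if (lblk_start <= es.start && es.start <= lblk_end)
            return 1;   /* extent begins inside the range */
        return 0;
    }

    int main(void)
    {
        struct es es = { .start = 8, .len = 4 };   /* delayed blocks 8..11 */

        printf("%d %d %d\n",
               range_has_delalloc(es, 0, 7),    /* 0: ends before extent */
               range_has_delalloc(es, 10, 20),  /* 1: covers block 10 */
               range_has_delalloc(es, 0, 8));   /* 1: extent starts at 8 */
        return 0;
    }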
3575/** 3536/**
@@ -3630,7 +3591,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3630 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); 3591 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3631 lblk_to = lblk_from + c_offset - 1; 3592 lblk_to = lblk_from + c_offset - 1;
3632 3593
3633 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3594 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3634 allocated_clusters--; 3595 allocated_clusters--;
3635 } 3596 }
3636 3597
@@ -3640,7 +3601,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3640 lblk_from = lblk_start + num_blks; 3601 lblk_from = lblk_start + num_blks;
3641 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3602 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3642 3603
3643 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3604 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3644 allocated_clusters--; 3605 allocated_clusters--;
3645 } 3606 }
3646 3607
@@ -3663,8 +3624,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3663 flags, allocated); 3624 flags, allocated);
3664 ext4_ext_show_leaf(inode, path); 3625 ext4_ext_show_leaf(inode, path);
3665 3626
3666 trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, 3627 trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
3667 newblock); 3628 allocated, newblock);
3668 3629
3669 /* get_block() before submit the IO, split the extent */ 3630 /* get_block() before submit the IO, split the extent */
3670 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3631 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
@@ -3911,7 +3872,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3911 struct ext4_extent newex, *ex, *ex2; 3872 struct ext4_extent newex, *ex, *ex2;
3912 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3873 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3913 ext4_fsblk_t newblock = 0; 3874 ext4_fsblk_t newblock = 0;
3914 int free_on_err = 0, err = 0, depth, ret; 3875 int free_on_err = 0, err = 0, depth;
3915 unsigned int allocated = 0, offset = 0; 3876 unsigned int allocated = 0, offset = 0;
3916 unsigned int allocated_clusters = 0; 3877 unsigned int allocated_clusters = 0;
3917 struct ext4_allocation_request ar; 3878 struct ext4_allocation_request ar;
@@ -3927,7 +3888,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3927 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3888 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3928 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3889 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3929 if ((sbi->s_cluster_ratio > 1) && 3890 if ((sbi->s_cluster_ratio > 1) &&
3930 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3891 ext4_find_delalloc_cluster(inode, map->m_lblk))
3931 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3892 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3932 3893
3933 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -4007,15 +3968,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4007 ee_len, ee_start); 3968 ee_len, ee_start);
4008 goto out; 3969 goto out;
4009 } 3970 }
4010 ret = ext4_ext_handle_uninitialized_extents( 3971 allocated = ext4_ext_handle_uninitialized_extents(
4011 handle, inode, map, path, flags, 3972 handle, inode, map, path, flags,
4012 allocated, newblock); 3973 allocated, newblock);
4013 return ret; 3974 goto out3;
4014 } 3975 }
4015 } 3976 }
4016 3977
4017 if ((sbi->s_cluster_ratio > 1) && 3978 if ((sbi->s_cluster_ratio > 1) &&
4018 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3979 ext4_find_delalloc_cluster(inode, map->m_lblk))
4019 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3980 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4020 3981
4021 /* 3982 /*
@@ -4284,8 +4245,8 @@ out2:
4284 kfree(path); 4245 kfree(path);
4285 } 4246 }
4286 4247
4287 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 4248out3:
4288 newblock, map->m_len, err ? err : allocated); 4249 trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
4289 4250
4290 return err ? err : allocated; 4251 return err ? err : allocated;
4291} 4252}
@@ -4344,6 +4305,8 @@ void ext4_ext_truncate(struct inode *inode)
4344 4305
4345 last_block = (inode->i_size + sb->s_blocksize - 1) 4306 last_block = (inode->i_size + sb->s_blocksize - 1)
4346 >> EXT4_BLOCK_SIZE_BITS(sb); 4307 >> EXT4_BLOCK_SIZE_BITS(sb);
4308 err = ext4_es_remove_extent(inode, last_block,
4309 EXT_MAX_BLOCKS - last_block);
4347 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4310 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4348 4311
4349 /* In a multi-transaction truncate, we only make the final 4312 /* In a multi-transaction truncate, we only make the final
@@ -4434,6 +4397,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4434 if (mode & FALLOC_FL_PUNCH_HOLE) 4397 if (mode & FALLOC_FL_PUNCH_HOLE)
4435 return ext4_punch_hole(file, offset, len); 4398 return ext4_punch_hole(file, offset, len);
4436 4399
4400 ret = ext4_convert_inline_data(inode);
4401 if (ret)
4402 return ret;
4403
4437 trace_ext4_fallocate_enter(inode, offset, len, mode); 4404 trace_ext4_fallocate_enter(inode, offset, len, mode);
4438 map.m_lblk = offset >> blkbits; 4405 map.m_lblk = offset >> blkbits;
4439 /* 4406 /*
@@ -4572,206 +4539,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4572} 4539}
4573 4540
4574/* 4541/*
 4575 * Callback function called for each extent to gather FIEMAP information. 4542 * If newex is not an existing extent (newex->ec_start equals zero), find
 4543 * a delayed extent at the start of newex, update newex accordingly, and
 4544 * return the start of the next delayed extent.
 4545 *
 4546 * If newex is an existing extent (newex->ec_start is not equal to zero),
 4547 * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
 4548 * delayed extent is found. Leave newex unmodified.
4576 */ 4549 */
4577static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, 4550static int ext4_find_delayed_extent(struct inode *inode,
4578 struct ext4_ext_cache *newex, struct ext4_extent *ex, 4551 struct ext4_ext_cache *newex)
4579 void *data)
4580{ 4552{
4581 __u64 logical; 4553 struct extent_status es;
4582 __u64 physical; 4554 ext4_lblk_t next_del;
4583 __u64 length;
4584 __u32 flags = 0;
4585 int ret = 0;
4586 struct fiemap_extent_info *fieinfo = data;
4587 unsigned char blksize_bits;
4588 4555
4589 blksize_bits = inode->i_sb->s_blocksize_bits; 4556 es.start = newex->ec_block;
4590 logical = (__u64)newex->ec_block << blksize_bits; 4557 next_del = ext4_es_find_extent(inode, &es);
4591 4558
4592 if (newex->ec_start == 0) { 4559 if (newex->ec_start == 0) {
4593 /* 4560 /*
4594 * No extent in extent-tree contains block @newex->ec_start, 4561 * No extent in extent-tree contains block @newex->ec_start,
4595 * then the block may stay in 1)a hole or 2)delayed-extent. 4562 * then the block may stay in 1)a hole or 2)delayed-extent.
4596 *
4597 * Holes or delayed-extents are processed as follows.
4598 * 1. lookup dirty pages with specified range in pagecache.
4599 * If no page is got, then there is no delayed-extent and
4600 * return with EXT_CONTINUE.
4601 * 2. find the 1st mapped buffer,
4602 * 3. check if the mapped buffer is both in the request range
4603 * and a delayed buffer. If not, there is no delayed-extent,
4604 * then return.
4605 * 4. a delayed-extent is found, the extent will be collected.
4606 */ 4563 */
4607 ext4_lblk_t end = 0; 4564 if (es.len == 0)
4608 pgoff_t last_offset; 4565 /* A hole found. */
4609 pgoff_t offset; 4566 return 0;
4610 pgoff_t index;
4611 pgoff_t start_index = 0;
4612 struct page **pages = NULL;
4613 struct buffer_head *bh = NULL;
4614 struct buffer_head *head = NULL;
4615 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4616
4617 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4618 if (pages == NULL)
4619 return -ENOMEM;
4620
4621 offset = logical >> PAGE_SHIFT;
4622repeat:
4623 last_offset = offset;
4624 head = NULL;
4625 ret = find_get_pages_tag(inode->i_mapping, &offset,
4626 PAGECACHE_TAG_DIRTY, nr_pages, pages);
4627
4628 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4629 /* First time, try to find a mapped buffer. */
4630 if (ret == 0) {
4631out:
4632 for (index = 0; index < ret; index++)
4633 page_cache_release(pages[index]);
4634 /* just a hole. */
4635 kfree(pages);
4636 return EXT_CONTINUE;
4637 }
4638 index = 0;
4639
4640next_page:
4641 /* Try to find the 1st mapped buffer. */
4642 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4643 blksize_bits;
4644 if (!page_has_buffers(pages[index]))
4645 goto out;
4646 head = page_buffers(pages[index]);
4647 if (!head)
4648 goto out;
4649
4650 index++;
4651 bh = head;
4652 do {
4653 if (end >= newex->ec_block +
4654 newex->ec_len)
4655 /* The buffer is out of
4656 * the request range.
4657 */
4658 goto out;
4659
4660 if (buffer_mapped(bh) &&
4661 end >= newex->ec_block) {
4662 start_index = index - 1;
4663 /* get the 1st mapped buffer. */
4664 goto found_mapped_buffer;
4665 }
4666
4667 bh = bh->b_this_page;
4668 end++;
4669 } while (bh != head);
4670
4671 /* No mapped buffer in the range found in this page,
4672 * We need to look up next page.
4673 */
4674 if (index >= ret) {
4675 /* There is no page left, but we need to limit
4676 * newex->ec_len.
4677 */
4678 newex->ec_len = end - newex->ec_block;
4679 goto out;
4680 }
4681 goto next_page;
4682 } else {
4683 /*Find contiguous delayed buffers. */
4684 if (ret > 0 && pages[0]->index == last_offset)
4685 head = page_buffers(pages[0]);
4686 bh = head;
4687 index = 1;
4688 start_index = 0;
4689 }
4690
4691found_mapped_buffer:
4692 if (bh != NULL && buffer_delay(bh)) {
4693 /* 1st or contiguous delayed buffer found. */
4694 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4695 /*
4696 * 1st delayed buffer found, record
4697 * the start of extent.
4698 */
4699 flags |= FIEMAP_EXTENT_DELALLOC;
4700 newex->ec_block = end;
4701 logical = (__u64)end << blksize_bits;
4702 }
4703 /* Find contiguous delayed buffers. */
4704 do {
4705 if (!buffer_delay(bh))
4706 goto found_delayed_extent;
4707 bh = bh->b_this_page;
4708 end++;
4709 } while (bh != head);
4710
4711 for (; index < ret; index++) {
4712 if (!page_has_buffers(pages[index])) {
4713 bh = NULL;
4714 break;
4715 }
4716 head = page_buffers(pages[index]);
4717 if (!head) {
4718 bh = NULL;
4719 break;
4720 }
4721
4722 if (pages[index]->index !=
4723 pages[start_index]->index + index
4724 - start_index) {
4725 /* Blocks are not contiguous. */
4726 bh = NULL;
4727 break;
4728 }
4729 bh = head;
4730 do {
4731 if (!buffer_delay(bh))
4732 /* Delayed-extent ends. */
4733 goto found_delayed_extent;
4734 bh = bh->b_this_page;
4735 end++;
4736 } while (bh != head);
4737 }
4738 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4739 /* a hole found. */
4740 goto out;
4741 4567
4742found_delayed_extent: 4568 if (es.start > newex->ec_block) {
4743 newex->ec_len = min(end - newex->ec_block, 4569 /* A hole found. */
4744 (ext4_lblk_t)EXT_INIT_MAX_LEN); 4570 newex->ec_len = min(es.start - newex->ec_block,
4745 if (ret == nr_pages && bh != NULL && 4571 newex->ec_len);
4746 newex->ec_len < EXT_INIT_MAX_LEN && 4572 return 0;
4747 buffer_delay(bh)) {
4748 /* Have not collected an extent and continue. */
4749 for (index = 0; index < ret; index++)
4750 page_cache_release(pages[index]);
4751 goto repeat;
4752 } 4573 }
4753 4574
4754 for (index = 0; index < ret; index++) 4575 newex->ec_len = es.start + es.len - newex->ec_block;
4755 page_cache_release(pages[index]);
4756 kfree(pages);
4757 } 4576 }
4758 4577
4759 physical = (__u64)newex->ec_start << blksize_bits; 4578 return next_del;
4760 length = (__u64)newex->ec_len << blksize_bits;
4761
4762 if (ex && ext4_ext_is_uninitialized(ex))
4763 flags |= FIEMAP_EXTENT_UNWRITTEN;
4764
4765 if (next == EXT_MAX_BLOCKS)
4766 flags |= FIEMAP_EXTENT_LAST;
4767
4768 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4769 length, flags);
4770 if (ret < 0)
4771 return ret;
4772 if (ret == 1)
4773 return EXT_BREAK;
4774 return EXT_CONTINUE;
4775} 4579}
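For the hole case (ec_start == 0), ext4_find_delayed_extent() trims the reported region so it ends where the next delayed extent begins, or grows it to the delayed extent's end when the extent covers the starting block. A compact standalone model of that length fixup (hypothetical types, not the kernel implementation):

    #include <stdio.h>

    struct ext { unsigned block, len; };

    static void fixup_hole(struct ext *newex, unsigned es_start, unsigned es_len)
    {
        if (es_len == 0)
            return;                       /* pure hole, leave as-is */
        if (es_start > newex->block) {    /* hole, then delayed extent */
            unsigned gap = es_start - newex->block;
            if (gap < newex->len)
                newex->len = gap;
            return;
        }
        /* delayed extent covers newex->block: report up to its end */
        newex->len = es_start + es_len - newex->block;
    }

    int main(void)
    {
        struct ext hole = { .block = 100, .len = 50 };

        fixup_hole(&hole, 120, 8);        /* delayed blocks 120..127 */
        printf("len=%u\n", hole.len);     /* 20: hole is 100..119 */
        return 0;
    }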
4776/* fiemap flags we can handle specified here */ 4580/* fiemap flags we can handle specified here */
4777#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4581#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -4971,6 +4775,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4971 ext4_ext_invalidate_cache(inode); 4775 ext4_ext_invalidate_cache(inode);
4972 ext4_discard_preallocations(inode); 4776 ext4_discard_preallocations(inode);
4973 4777
4778 err = ext4_es_remove_extent(inode, first_block,
4779 stop_block - first_block);
4974 err = ext4_ext_remove_space(inode, first_block, stop_block - 1); 4780 err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4975 4781
4976 ext4_ext_invalidate_cache(inode); 4782 ext4_ext_invalidate_cache(inode);
@@ -4991,12 +4797,22 @@ out_mutex:
4991 mutex_unlock(&inode->i_mutex); 4797 mutex_unlock(&inode->i_mutex);
4992 return err; 4798 return err;
4993} 4799}
4800
4994int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4801int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4995 __u64 start, __u64 len) 4802 __u64 start, __u64 len)
4996{ 4803{
4997 ext4_lblk_t start_blk; 4804 ext4_lblk_t start_blk;
4998 int error = 0; 4805 int error = 0;
4999 4806
4807 if (ext4_has_inline_data(inode)) {
4808 int has_inline = 1;
4809
4810 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
4811
4812 if (has_inline)
4813 return error;
4814 }
4815
5000 /* fallback to generic here if not in extents fmt */ 4816 /* fallback to generic here if not in extents fmt */
5001 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4817 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5002 return generic_block_fiemap(inode, fieinfo, start, len, 4818 return generic_block_fiemap(inode, fieinfo, start, len,
@@ -5018,11 +4834,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5018 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4834 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5019 4835
5020 /* 4836 /*
5021 * Walk the extent tree gathering extent information. 4837 * Walk the extent tree gathering extent information
5022 * ext4_ext_fiemap_cb will push extents back to user. 4838 * and pushing extents back to the user.
5023 */ 4839 */
5024 error = ext4_ext_walk_space(inode, start_blk, len_blks, 4840 error = ext4_fill_fiemap_extents(inode, start_blk,
5025 ext4_ext_fiemap_cb, fieinfo); 4841 len_blks, fieinfo);
5026 } 4842 }
5027 4843
5028 return error; 4844 return error;
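The net effect of these extents.c changes is visible from userspace through the FIEMAP ioctl. A small test program (standard Linux API; error handling trimmed for brevity) that prints each extent with the delalloc/unwritten/last flags the new code sets:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
        if (argc != 2)
            return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        unsigned count = 32;
        struct fiemap *fm = calloc(1, sizeof(*fm) +
                                   count * sizeof(struct fiemap_extent));
        if (!fm)
            return 1;
        fm->fm_length = ~0ULL;          /* map the whole file */
        fm->fm_extent_count = count;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
            perror("fiemap");
            return 1;
        }
        for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
            struct fiemap_extent *fe = &fm->fm_extents[i];
            printf("%llu..%llu%s%s%s\n",
                   (unsigned long long)fe->fe_logical,
                   (unsigned long long)(fe->fe_logical + fe->fe_length - 1),
                   (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " delalloc" : "",
                   (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " unwritten" : "",
                   (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " last" : "");
        }
        free(fm);
        close(fd);
        return 0;
    }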
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 000000000000..564d981a2fcc
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,500 @@
1/*
2 * fs/ext4/extents_status.c
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Hugh Dickins <hughd@google.com>
8 * Zheng Liu <wenqing.lz@taobao.com>
9 *
10 * Ext4 extents status tree core functions.
11 */
12#include <linux/rbtree.h>
13#include "ext4.h"
14#include "extents_status.h"
15#include "ext4_extents.h"
16
17#include <trace/events/ext4.h>
18
 19/*
 20 * According to previous discussion at the Ext4 Developer Workshop, we
 21 * will introduce a new structure called the io tree to track all extent
 22 * status in order to solve some problems that we have met
 23 * (e.g. the reserved space warning) and to provide extent-level locking.
 24 * The delay extent tree is the first step towards this goal. It was
 25 * originally built by Yongqiang Yang. At that time it was called the
 26 * delay extent tree, whose only goal was to track delayed extents in
 27 * memory to simplify the implementation of fiemap and bigalloc, and to
 28 * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it is still
 29 * called the delay extent tree in the comments below. But to better
 30 * reflect what it does, it has been renamed the extent status tree.
 31 *
 32 * Currently the first step has been done. All delayed extents are
 33 * tracked in the tree. The tree is maintained when a delayed
 34 * allocation is issued and when the delayed extent is written out or
 35 * invalidated. Therefore the implementations of fiemap and bigalloc
 36 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 37 *
 38 * The following comment describes the implementation of the extent
 39 * status tree and future work.
 40 */
41
42/*
43 * extents status tree implementation for ext4.
44 *
45 *
46 * ==========================================================================
 47 * Extent status encompasses delayed extents and extent locks
 48 *
 49 * 1. Why a delayed extent implementation?
 50 *
 51 * Without delayed extents, ext4 identifies a delayed extent by looking
 52 * up the page cache; this has several deficiencies - complicated,
 53 * buggy, and inefficient code.
 54 *
 55 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
 56 * to know whether a block or a range of blocks belongs to a delayed
 57 * extent.
58 *
 59 * Let us have a look at how they work without delayed extents.
60 * -- FIEMAP
 61 * FIEMAP looks up the page cache to distinguish delayed allocations from holes.
62 *
63 * -- SEEK_HOLE/DATA
64 * SEEK_HOLE/DATA has the same problem as FIEMAP.
65 *
66 * -- bigalloc
 67 * bigalloc looks up the page cache to figure out whether a block
 68 * is already under delayed allocation, in order to determine
 69 * whether a quota reservation is needed for the cluster.
70 *
71 * -- punch hole
 72 * punch hole looks up the page cache to identify a delayed extent.
73 *
74 * -- writeout
 75 * Writeout looks up the whole page cache to see if a buffer is
 76 * mapped. If there are not very many delayed buffers, this is
 77 * time consuming.
78 *
 79 * With the delayed extent implementation, FIEMAP, SEEK_HOLE/DATA,
 80 * bigalloc and writeout can figure out whether a block or a range of
 81 * blocks is under delayed allocation (i.e., belongs to a delayed
 82 * extent) simply by searching the delayed extent tree.
83 *
84 *
85 * ==========================================================================
 86 * 2. ext4 delayed extent implementation
87 *
88 * -- delayed extent
 89 * A delayed extent is a range of blocks which are logically
 90 * contiguous and under delayed allocation. Unlike an on-disk
 91 * extent, a delayed extent is an in-memory structure; there is
 92 * no corresponding on-disk data. There is no limit on the length
 93 * of a delayed extent, so a delayed extent can contain as many
 94 * blocks as are logically contiguous.
95 *
96 * -- delayed extent tree
 97 * Every inode has a delayed extent tree, and all blocks under
 98 * delayed allocation are added to the tree as delayed extents.
99 * Delayed extents in the tree are ordered by logical block no.
100 *
101 * -- operations on a delayed extent tree
 102 * There are three operations on a delayed extent tree: finding the
 103 * next delayed extent, adding a space (a range of blocks) and
 104 * removing a space.
105 *
106 * -- race on a delayed extent tree
 107 * The delayed extent tree is protected by inode->i_es_lock.
108 *
109 *
110 * ==========================================================================
111 * 3. performance analysis
112 * -- overhead
 113 * 1. There is a cached extent for write access, so if writes are
 114 * not very random, adding-space operations run in O(1) time.
115 *
116 * -- gain
117 * 2. Code is much simpler, more readable, more maintainable and
118 * more efficient.
119 *
120 *
121 * ==========================================================================
122 * 4. TODO list
123 * -- Track all extent status
124 *
125 * -- Improve get block process
126 *
127 * -- Extent-level locking
128 */
129
130static struct kmem_cache *ext4_es_cachep;
131
132int __init ext4_init_es(void)
133{
134 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
135 if (ext4_es_cachep == NULL)
136 return -ENOMEM;
137 return 0;
138}
139
140void ext4_exit_es(void)
141{
142 if (ext4_es_cachep)
143 kmem_cache_destroy(ext4_es_cachep);
144}
145
146void ext4_es_init_tree(struct ext4_es_tree *tree)
147{
148 tree->root = RB_ROOT;
149 tree->cache_es = NULL;
150}
151
152#ifdef ES_DEBUG__
153static void ext4_es_print_tree(struct inode *inode)
154{
155 struct ext4_es_tree *tree;
156 struct rb_node *node;
157
158 printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
159 tree = &EXT4_I(inode)->i_es_tree;
160 node = rb_first(&tree->root);
161 while (node) {
162 struct extent_status *es;
163 es = rb_entry(node, struct extent_status, rb_node);
164 printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
165 node = rb_next(node);
166 }
167 printk(KERN_DEBUG "\n");
168}
169#else
170#define ext4_es_print_tree(inode)
171#endif
172
173static inline ext4_lblk_t extent_status_end(struct extent_status *es)
174{
175 BUG_ON(es->start + es->len < es->start);
176 return es->start + es->len - 1;
177}
178
179/*
 180 * Search through the tree for a delayed extent at a given offset. If
 181 * it can't be found, try to find the next extent.
182 */
183static struct extent_status *__es_tree_search(struct rb_root *root,
184 ext4_lblk_t offset)
185{
186 struct rb_node *node = root->rb_node;
187 struct extent_status *es = NULL;
188
189 while (node) {
190 es = rb_entry(node, struct extent_status, rb_node);
191 if (offset < es->start)
192 node = node->rb_left;
193 else if (offset > extent_status_end(es))
194 node = node->rb_right;
195 else
196 return es;
197 }
198
199 if (es && offset < es->start)
200 return es;
201
202 if (es && offset > extent_status_end(es)) {
203 node = rb_next(&es->rb_node);
204 return node ? rb_entry(node, struct extent_status, rb_node) :
205 NULL;
206 }
207
208 return NULL;
209}
210
211/*
212 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
 213 * if it exists; otherwise, find the next extent after @es->start.
214 *
215 * @inode: the inode which owns delayed extents
216 * @es: delayed extent that we found
217 *
 218 * Returns the first block of the next extent after @es, or
 219 * EXT_MAX_BLOCKS if no further delayed extent is found.
 220 * The delayed extent found is returned via @es.
221 */
222ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
223{
224 struct ext4_es_tree *tree = NULL;
225 struct extent_status *es1 = NULL;
226 struct rb_node *node;
227 ext4_lblk_t ret = EXT_MAX_BLOCKS;
228
229 trace_ext4_es_find_extent_enter(inode, es->start);
230
231 read_lock(&EXT4_I(inode)->i_es_lock);
232 tree = &EXT4_I(inode)->i_es_tree;
233
 234 /* check the cached extent first */
235 if (tree->cache_es) {
236 es1 = tree->cache_es;
237 if (in_range(es->start, es1->start, es1->len)) {
238 es_debug("%u cached by [%u/%u)\n",
239 es->start, es1->start, es1->len);
240 goto out;
241 }
242 }
243
244 es->len = 0;
245 es1 = __es_tree_search(&tree->root, es->start);
246
247out:
248 if (es1) {
249 tree->cache_es = es1;
250 es->start = es1->start;
251 es->len = es1->len;
252 node = rb_next(&es1->rb_node);
253 if (node) {
254 es1 = rb_entry(node, struct extent_status, rb_node);
255 ret = es1->start;
256 }
257 }
258
259 read_unlock(&EXT4_I(inode)->i_es_lock);
260
261 trace_ext4_es_find_extent_exit(inode, es, ret);
262 return ret;
263}
264
265static struct extent_status *
266ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
267{
268 struct extent_status *es;
269 es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
270 if (es == NULL)
271 return NULL;
272 es->start = start;
273 es->len = len;
274 return es;
275}
276
277static void ext4_es_free_extent(struct extent_status *es)
278{
279 kmem_cache_free(ext4_es_cachep, es);
280}
281
282static struct extent_status *
283ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
284{
285 struct extent_status *es1;
286 struct rb_node *node;
287
288 node = rb_prev(&es->rb_node);
289 if (!node)
290 return es;
291
292 es1 = rb_entry(node, struct extent_status, rb_node);
293 if (es->start == extent_status_end(es1) + 1) {
294 es1->len += es->len;
295 rb_erase(&es->rb_node, &tree->root);
296 ext4_es_free_extent(es);
297 es = es1;
298 }
299
300 return es;
301}
302
303static struct extent_status *
304ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
305{
306 struct extent_status *es1;
307 struct rb_node *node;
308
309 node = rb_next(&es->rb_node);
310 if (!node)
311 return es;
312
313 es1 = rb_entry(node, struct extent_status, rb_node);
314 if (es1->start == extent_status_end(es) + 1) {
315 es->len += es1->len;
316 rb_erase(node, &tree->root);
317 ext4_es_free_extent(es1);
318 }
319
320 return es;
321}
322
323static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
324 ext4_lblk_t len)
325{
326 struct rb_node **p = &tree->root.rb_node;
327 struct rb_node *parent = NULL;
328 struct extent_status *es;
329 ext4_lblk_t end = offset + len - 1;
330
331 BUG_ON(end < offset);
332 es = tree->cache_es;
333 if (es && offset == (extent_status_end(es) + 1)) {
334 es_debug("cached by [%u/%u)\n", es->start, es->len);
335 es->len += len;
336 es = ext4_es_try_to_merge_right(tree, es);
337 goto out;
338 } else if (es && es->start == end + 1) {
339 es_debug("cached by [%u/%u)\n", es->start, es->len);
340 es->start = offset;
341 es->len += len;
342 es = ext4_es_try_to_merge_left(tree, es);
343 goto out;
344 } else if (es && es->start <= offset &&
345 end <= extent_status_end(es)) {
346 es_debug("cached by [%u/%u)\n", es->start, es->len);
347 goto out;
348 }
349
350 while (*p) {
351 parent = *p;
352 es = rb_entry(parent, struct extent_status, rb_node);
353
354 if (offset < es->start) {
355 if (es->start == end + 1) {
356 es->start = offset;
357 es->len += len;
358 es = ext4_es_try_to_merge_left(tree, es);
359 goto out;
360 }
361 p = &(*p)->rb_left;
362 } else if (offset > extent_status_end(es)) {
363 if (offset == extent_status_end(es) + 1) {
364 es->len += len;
365 es = ext4_es_try_to_merge_right(tree, es);
366 goto out;
367 }
368 p = &(*p)->rb_right;
369 } else {
370 if (extent_status_end(es) <= end)
371 es->len = offset - es->start + len;
372 goto out;
373 }
374 }
375
376 es = ext4_es_alloc_extent(offset, len);
377 if (!es)
378 return -ENOMEM;
379 rb_link_node(&es->rb_node, parent, p);
380 rb_insert_color(&es->rb_node, &tree->root);
381
382out:
383 tree->cache_es = es;
384 return 0;
385}
386
387/*
388 * ext4_es_insert_extent() adds a space to a delayed extent tree.
389 * Caller holds inode->i_es_lock.
390 *
391 * ext4_es_insert_extent is called by ext4_da_write_begin and
392 * ext4_es_remove_extent.
393 *
394 * Return 0 on success, error code on failure.
395 */
396int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
397 ext4_lblk_t len)
398{
399 struct ext4_es_tree *tree;
400 int err = 0;
401
402 trace_ext4_es_insert_extent(inode, offset, len);
403 es_debug("add [%u/%u) to extent status tree of inode %lu\n",
404 offset, len, inode->i_ino);
405
406 write_lock(&EXT4_I(inode)->i_es_lock);
407 tree = &EXT4_I(inode)->i_es_tree;
408 err = __es_insert_extent(tree, offset, len);
409 write_unlock(&EXT4_I(inode)->i_es_lock);
410
411 ext4_es_print_tree(inode);
412
413 return err;
414}
415
416/*
417 * ext4_es_remove_extent() removes a space from a delayed extent tree.
 418 * It takes inode->i_es_lock for writing internally.
419 *
420 * Return 0 on success, error code on failure.
421 */
422int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
423 ext4_lblk_t len)
424{
425 struct rb_node *node;
426 struct ext4_es_tree *tree;
427 struct extent_status *es;
428 struct extent_status orig_es;
429 ext4_lblk_t len1, len2, end;
430 int err = 0;
431
432 trace_ext4_es_remove_extent(inode, offset, len);
433 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
434 offset, len, inode->i_ino);
435
436 end = offset + len - 1;
437 BUG_ON(end < offset);
438 write_lock(&EXT4_I(inode)->i_es_lock);
439 tree = &EXT4_I(inode)->i_es_tree;
440 es = __es_tree_search(&tree->root, offset);
441 if (!es)
442 goto out;
443 if (es->start > end)
444 goto out;
445
446 /* Simply invalidate cache_es. */
447 tree->cache_es = NULL;
448
449 orig_es.start = es->start;
450 orig_es.len = es->len;
451 len1 = offset > es->start ? offset - es->start : 0;
452 len2 = extent_status_end(es) > end ?
453 extent_status_end(es) - end : 0;
454 if (len1 > 0)
455 es->len = len1;
456 if (len2 > 0) {
457 if (len1 > 0) {
458 err = __es_insert_extent(tree, end + 1, len2);
459 if (err) {
460 es->start = orig_es.start;
461 es->len = orig_es.len;
462 goto out;
463 }
464 } else {
465 es->start = end + 1;
466 es->len = len2;
467 }
468 goto out;
469 }
470
471 if (len1 > 0) {
472 node = rb_next(&es->rb_node);
473 if (node)
474 es = rb_entry(node, struct extent_status, rb_node);
475 else
476 es = NULL;
477 }
478
479 while (es && extent_status_end(es) <= end) {
480 node = rb_next(&es->rb_node);
481 rb_erase(&es->rb_node, &tree->root);
482 ext4_es_free_extent(es);
483 if (!node) {
484 es = NULL;
485 break;
486 }
487 es = rb_entry(node, struct extent_status, rb_node);
488 }
489
490 if (es && es->start < end + 1) {
491 len1 = extent_status_end(es) - end;
492 es->start = end + 1;
493 es->len = len1;
494 }
495
496out:
497 write_unlock(&EXT4_I(inode)->i_es_lock);
498 ext4_es_print_tree(inode);
499 return err;
500}
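The insert and remove paths above maintain two invariants: inserting a range adjacent to an existing extent merges the two, and removing a range from the middle of an extent splits it in two. A userspace model of those semantics, with a sorted array standing in for the rb-tree (illustrative code assuming non-overlapping inserts; not the kernel implementation):

    #include <stdio.h>
    #include <string.h>

    struct es { unsigned start, len; };

    static struct es tree[64];
    static int nr;

    static unsigned end_of(struct es *e) { return e->start + e->len - 1; }

    /* Insert [start, start+len) and merge with adjacent neighbours. */
    static void es_insert(unsigned start, unsigned len)
    {
        int i = 0;

        while (i < nr && tree[i].start < start)
            i++;
        memmove(&tree[i + 1], &tree[i], (nr - i) * sizeof(tree[0]));
        tree[i] = (struct es){ start, len };
        nr++;
        /* merge right, then left */
        if (i + 1 < nr && tree[i + 1].start == end_of(&tree[i]) + 1) {
            tree[i].len += tree[i + 1].len;
            memmove(&tree[i + 1], &tree[i + 2],
                    (nr - i - 2) * sizeof(tree[0]));
            nr--;
        }
        if (i > 0 && tree[i].start == end_of(&tree[i - 1]) + 1) {
            tree[i - 1].len += tree[i].len;
            memmove(&tree[i], &tree[i + 1], (nr - i - 1) * sizeof(tree[0]));
            nr--;
        }
    }

    /* Remove [start, start+len), splitting an extent if needed. */
    static void es_remove(unsigned start, unsigned len)
    {
        unsigned end = start + len - 1;

        for (int i = 0; i < nr; i++) {
            struct es *e = &tree[i];
            if (e->start > end || end_of(e) < start)
                continue;
            if (e->start < start && end_of(e) > end) {   /* split */
                unsigned tail = end_of(e) - end;
                e->len = start - e->start;
                es_insert(end + 1, tail);
                return;
            }
            if (e->start < start) {                      /* trim tail */
                e->len = start - e->start;
            } else if (end_of(e) > end) {                /* trim head */
                e->len = end_of(e) - end;
                e->start = end + 1;
            } else {                                     /* drop whole */
                memmove(e, e + 1, (nr - i - 1) * sizeof(tree[0]));
                nr--; i--;
            }
        }
    }

    int main(void)
    {
        es_insert(0, 4);
        es_insert(4, 4);          /* merges into [0/8) */
        es_remove(2, 2);          /* splits into [0/2) and [4/4) */
        for (int i = 0; i < nr; i++)
            printf("[%u/%u) ", tree[i].start, tree[i].len);
        printf("\n");             /* prints: [0/2) [4/4) */
        return 0;
    }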
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 000000000000..077f82db092a
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,45 @@
1/*
2 * fs/ext4/extents_status.h
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Zheng Liu <wenqing.lz@taobao.com>
8 *
9 */
10
11#ifndef _EXT4_EXTENTS_STATUS_H
12#define _EXT4_EXTENTS_STATUS_H
13
14/*
15 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
16 */
17#ifdef ES_DEBUG__
18#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
19#else
20#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
21#endif
22
23struct extent_status {
24 struct rb_node rb_node;
25 ext4_lblk_t start; /* first block extent covers */
26 ext4_lblk_t len; /* length of extent in block */
27};
28
29struct ext4_es_tree {
30 struct rb_root root;
31 struct extent_status *cache_es; /* recently accessed extent */
32};
33
34extern int __init ext4_init_es(void);
35extern void ext4_exit_es(void);
36extern void ext4_es_init_tree(struct ext4_es_tree *tree);
37
38extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
39 ext4_lblk_t len);
40extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
41 ext4_lblk_t len);
42extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
43 struct extent_status *es);
44
45#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bf3966bccd34..d07c27ca594a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -24,6 +24,7 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/quotaops.h> 26#include <linux/quotaops.h>
27#include <linux/pagevec.h>
27#include "ext4.h" 28#include "ext4.h"
28#include "ext4_jbd2.h" 29#include "ext4_jbd2.h"
29#include "xattr.h" 30#include "xattr.h"
@@ -286,11 +287,329 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
286} 287}
287 288
288/* 289/*
 290 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
 291 * file rather than ext4_ext_walk_space() because we can handle
 292 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
 293 * function. Once the extent status tree has been fully implemented, it will
 294 * track all extent status for a file and we can use it directly to
 295 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
296 */
297
298/*
 299 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to
 300 * look up the page cache to check whether there is any data in
 301 * [startoff, endoff], because if this range contains an unwritten extent,
 302 * we treat the extent as data or as a hole according to whether the
 303 * page cache has data or not.
304 */
305static int ext4_find_unwritten_pgoff(struct inode *inode,
306 int whence,
307 struct ext4_map_blocks *map,
308 loff_t *offset)
309{
310 struct pagevec pvec;
311 unsigned int blkbits;
312 pgoff_t index;
313 pgoff_t end;
314 loff_t endoff;
315 loff_t startoff;
316 loff_t lastoff;
317 int found = 0;
318
319 blkbits = inode->i_sb->s_blocksize_bits;
320 startoff = *offset;
321 lastoff = startoff;
322 endoff = (map->m_lblk + map->m_len) << blkbits;
323
324 index = startoff >> PAGE_CACHE_SHIFT;
325 end = endoff >> PAGE_CACHE_SHIFT;
326
327 pagevec_init(&pvec, 0);
328 do {
329 int i, num;
330 unsigned long nr_pages;
331
332 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
333 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
334 (pgoff_t)num);
335 if (nr_pages == 0) {
336 if (whence == SEEK_DATA)
337 break;
338
339 BUG_ON(whence != SEEK_HOLE);
340 /*
 341 * If this is the first pass of the loop and the
 342 * offset is not beyond the end offset, there is a
 343 * hole at this offset.
344 */
345 if (lastoff == startoff || lastoff < endoff)
346 found = 1;
347 break;
348 }
349
350 /*
 351 * If this is the first pass of the loop and the
 352 * offset is smaller than the first page offset, there is a
 353 * hole at this offset.
354 */
355 if (lastoff == startoff && whence == SEEK_HOLE &&
356 lastoff < page_offset(pvec.pages[0])) {
357 found = 1;
358 break;
359 }
360
361 for (i = 0; i < nr_pages; i++) {
362 struct page *page = pvec.pages[i];
363 struct buffer_head *bh, *head;
364
365 /*
 366 * If the current offset is not beyond the end of the given
 367 * range, it is a hole.
368 */
369 if (lastoff < endoff && whence == SEEK_HOLE &&
370 page->index > end) {
371 found = 1;
372 *offset = lastoff;
373 goto out;
374 }
375
376 lock_page(page);
377
378 if (unlikely(page->mapping != inode->i_mapping)) {
379 unlock_page(page);
380 continue;
381 }
382
383 if (!page_has_buffers(page)) {
384 unlock_page(page);
385 continue;
386 }
387
388 if (page_has_buffers(page)) {
389 lastoff = page_offset(page);
390 bh = head = page_buffers(page);
391 do {
392 if (buffer_uptodate(bh) ||
393 buffer_unwritten(bh)) {
394 if (whence == SEEK_DATA)
395 found = 1;
396 } else {
397 if (whence == SEEK_HOLE)
398 found = 1;
399 }
400 if (found) {
401 *offset = max_t(loff_t,
402 startoff, lastoff);
403 unlock_page(page);
404 goto out;
405 }
406 lastoff += bh->b_size;
407 bh = bh->b_this_page;
408 } while (bh != head);
409 }
410
411 lastoff = page_offset(page) + PAGE_SIZE;
412 unlock_page(page);
413 }
414
415 /*
 416 * Fewer pages were returned than we asked for, so there is
 417 * a hole in the rest of the range.
418 */
419 if (nr_pages < num && whence == SEEK_HOLE) {
420 found = 1;
421 *offset = lastoff;
422 break;
423 }
424
425 index = pvec.pages[i - 1]->index + 1;
426 pagevec_release(&pvec);
427 } while (index <= end);
428
429out:
430 pagevec_release(&pvec);
431 return found;
432}
433
434/*
435 * ext4_seek_data() retrieves the offset for SEEK_DATA.
436 */
437static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
438{
439 struct inode *inode = file->f_mapping->host;
440 struct ext4_map_blocks map;
441 struct extent_status es;
442 ext4_lblk_t start, last, end;
443 loff_t dataoff, isize;
444 int blkbits;
445 int ret = 0;
446
447 mutex_lock(&inode->i_mutex);
448
449 isize = i_size_read(inode);
450 if (offset >= isize) {
451 mutex_unlock(&inode->i_mutex);
452 return -ENXIO;
453 }
454
455 blkbits = inode->i_sb->s_blocksize_bits;
456 start = offset >> blkbits;
457 last = start;
458 end = isize >> blkbits;
459 dataoff = offset;
460
461 do {
462 map.m_lblk = last;
463 map.m_len = end - last + 1;
464 ret = ext4_map_blocks(NULL, inode, &map, 0);
465 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
466 if (last != start)
467 dataoff = last << blkbits;
468 break;
469 }
470
471 /*
 472 * If there is a delayed extent at this offset,
 473 * it is treated as data.
474 */
475 es.start = last;
476 (void)ext4_es_find_extent(inode, &es);
477 if (last >= es.start &&
478 last < es.start + es.len) {
479 if (last != start)
480 dataoff = last << blkbits;
481 break;
482 }
483
484 /*
 485 * If there is an unwritten extent at this offset,
 486 * it is treated as data or as a hole according to
 487 * whether the page cache has data or not.
488 */
489 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
490 int unwritten;
491 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
492 &map, &dataoff);
493 if (unwritten)
494 break;
495 }
496
497 last++;
498 dataoff = last << blkbits;
499 } while (last <= end);
500
501 mutex_unlock(&inode->i_mutex);
502
503 if (dataoff > isize)
504 return -ENXIO;
505
506 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
507 return -EINVAL;
508 if (dataoff > maxsize)
509 return -EINVAL;
510
511 if (dataoff != file->f_pos) {
512 file->f_pos = dataoff;
513 file->f_version = 0;
514 }
515
516 return dataoff;
517}
518
519/*
520 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
521 */
522static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
523{
524 struct inode *inode = file->f_mapping->host;
525 struct ext4_map_blocks map;
526 struct extent_status es;
527 ext4_lblk_t start, last, end;
528 loff_t holeoff, isize;
529 int blkbits;
530 int ret = 0;
531
532 mutex_lock(&inode->i_mutex);
533
534 isize = i_size_read(inode);
535 if (offset >= isize) {
536 mutex_unlock(&inode->i_mutex);
537 return -ENXIO;
538 }
539
540 blkbits = inode->i_sb->s_blocksize_bits;
541 start = offset >> blkbits;
542 last = start;
543 end = isize >> blkbits;
544 holeoff = offset;
545
546 do {
547 map.m_lblk = last;
548 map.m_len = end - last + 1;
549 ret = ext4_map_blocks(NULL, inode, &map, 0);
550 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
551 last += ret;
552 holeoff = last << blkbits;
553 continue;
554 }
555
556 /*
 557 * If there is a delayed extent at this offset,
558 * we will skip this extent.
559 */
560 es.start = last;
561 (void)ext4_es_find_extent(inode, &es);
562 if (last >= es.start &&
563 last < es.start + es.len) {
564 last = es.start + es.len;
565 holeoff = last << blkbits;
566 continue;
567 }
568
569 /*
 570 * If there is an unwritten extent at this offset,
 571 * it is treated as data or as a hole according to
 572 * whether the page cache has data or not.
573 */
574 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
575 int unwritten;
576 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
577 &map, &holeoff);
578 if (!unwritten) {
579 last += ret;
580 holeoff = last << blkbits;
581 continue;
582 }
583 }
584
585 /* find a hole */
586 break;
587 } while (last <= end);
588
589 mutex_unlock(&inode->i_mutex);
590
591 if (holeoff > isize)
592 holeoff = isize;
593
594 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
595 return -EINVAL;
596 if (holeoff > maxsize)
597 return -EINVAL;
598
599 if (holeoff != file->f_pos) {
600 file->f_pos = holeoff;
601 file->f_version = 0;
602 }
603
604 return holeoff;
605}
606
607/*
289 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 608 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
290 * by calling generic_file_llseek_size() with the appropriate maxbytes 609 * by calling generic_file_llseek_size() with the appropriate maxbytes
291 * value for each. 610 * value for each.
292 */ 611 */
293loff_t ext4_llseek(struct file *file, loff_t offset, int origin) 612loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
294{ 613{
295 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
296 loff_t maxbytes; 615 loff_t maxbytes;
@@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
300 else 619 else
301 maxbytes = inode->i_sb->s_maxbytes; 620 maxbytes = inode->i_sb->s_maxbytes;
302 621
303 return generic_file_llseek_size(file, offset, origin, 622 switch (whence) {
304 maxbytes, i_size_read(inode)); 623 case SEEK_SET:
624 case SEEK_CUR:
625 case SEEK_END:
626 return generic_file_llseek_size(file, offset, whence,
627 maxbytes, i_size_read(inode));
628 case SEEK_DATA:
629 return ext4_seek_data(file, offset, maxbytes);
630 case SEEK_HOLE:
631 return ext4_seek_hole(file, offset, maxbytes);
632 }
633
634 return -EINVAL;
305} 635}
306 636
307const struct file_operations ext4_file_operations = { 637const struct file_operations ext4_file_operations = {
@@ -326,12 +656,10 @@ const struct file_operations ext4_file_operations = {
326const struct inode_operations ext4_file_inode_operations = { 656const struct inode_operations ext4_file_inode_operations = {
327 .setattr = ext4_setattr, 657 .setattr = ext4_setattr,
328 .getattr = ext4_getattr, 658 .getattr = ext4_getattr,
329#ifdef CONFIG_EXT4_FS_XATTR
330 .setxattr = generic_setxattr, 659 .setxattr = generic_setxattr,
331 .getxattr = generic_getxattr, 660 .getxattr = generic_getxattr,
332 .listxattr = ext4_listxattr, 661 .listxattr = ext4_listxattr,
333 .removexattr = generic_removexattr, 662 .removexattr = generic_removexattr,
334#endif
335 .get_acl = ext4_get_acl, 663 .get_acl = ext4_get_acl,
336 .fiemap = ext4_fiemap, 664 .fiemap = ext4_fiemap,
337}; 665};
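The new SEEK_DATA/SEEK_HOLE support wired into ext4_llseek() above can be exercised from userspace with plain lseek(). A short demo that walks a file's data segments (standard glibc API; _GNU_SOURCE is required for the SEEK_DATA/SEEK_HOLE definitions):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        if (argc != 2)
            return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        off_t off = 0, data, hole;

        /* Walk the file, printing each data segment; lseek(SEEK_DATA)
         * fails with ENXIO past the last data, which ends the loop. */
        while ((data = lseek(fd, off, SEEK_DATA)) >= 0) {
            hole = lseek(fd, data, SEEK_HOLE);  /* EOF counts as a hole */
            if (hole < 0)
                break;
            printf("data: %lld..%lld\n",
                   (long long)data, (long long)hole - 1);
            off = hole;
        }
        close(fd);
        return 0;
    }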
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index be1d89f385b4..dfbc1fe96674 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,7 +44,6 @@
44 */ 44 */
45static int ext4_sync_parent(struct inode *inode) 45static int ext4_sync_parent(struct inode *inode)
46{ 46{
47 struct writeback_control wbc;
48 struct dentry *dentry = NULL; 47 struct dentry *dentry = NULL;
49 struct inode *next; 48 struct inode *next;
50 int ret = 0; 49 int ret = 0;
@@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode)
66 ret = sync_mapping_buffers(inode->i_mapping); 65 ret = sync_mapping_buffers(inode->i_mapping);
67 if (ret) 66 if (ret)
68 break; 67 break;
69 memset(&wbc, 0, sizeof(wbc)); 68 ret = sync_inode_metadata(inode, 1);
70 wbc.sync_mode = WB_SYNC_ALL;
71 wbc.nr_to_write = 0; /* only write out the inode */
72 ret = sync_inode(inode, &wbc);
73 if (ret) 69 if (ret)
74 break; 70 break;
75 } 71 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3a100e7a62a8..3f32c8012447 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -762,7 +762,6 @@ got:
762 762
763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); 763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); 764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
765 brelse(block_bitmap_bh);
766 765
767 /* recheck and clear flag under lock if we still need to */ 766 /* recheck and clear flag under lock if we still need to */
768 ext4_lock_group(sb, group); 767 ext4_lock_group(sb, group);
@@ -775,6 +774,7 @@ got:
775 ext4_group_desc_csum_set(sb, group, gdp); 774 ext4_group_desc_csum_set(sb, group, gdp);
776 } 775 }
777 ext4_unlock_group(sb, group); 776 ext4_unlock_group(sb, group);
777 brelse(block_bitmap_bh);
778 778
779 if (err) 779 if (err)
780 goto fail; 780 goto fail;
@@ -902,6 +902,10 @@ got:
902 902
903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
904 904
905 ei->i_inline_off = 0;
906 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
907 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
908
905 ret = inode; 909 ret = inode;
906 dquot_initialize(inode); 910 dquot_initialize(inode);
907 err = dquot_alloc_inode(inode); 911 err = dquot_alloc_inode(inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 792e388e7b44..20862f96e8ae 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
22 22
23#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
24#include "truncate.h" 24#include "truncate.h"
25#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
25 26
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
@@ -755,8 +756,7 @@ cleanup:
755 partial--; 756 partial--;
756 } 757 }
757out: 758out:
758 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 759 trace_ext4_ind_map_blocks_exit(inode, map, err);
759 map->m_pblk, map->m_len, err);
760 return err; 760 return err;
761} 761}
762 762
@@ -1412,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode)
1412 down_write(&ei->i_data_sem); 1412 down_write(&ei->i_data_sem);
1413 1413
1414 ext4_discard_preallocations(inode); 1414 ext4_discard_preallocations(inode);
1415 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
1415 1416
1416 /* 1417 /*
1417 * The orphan list entry will now protect us from any crash which 1418 * The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 000000000000..387c47c6cda9
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,1884 @@
1/*
2 * Copyright (c) 2012 Taobao.
3 * Written by Tao Ma <boyu.mt@taobao.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14#include "ext4_jbd2.h"
15#include "ext4.h"
16#include "xattr.h"
17#include "truncate.h"
18#include <linux/fiemap.h>
19
20#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
22#define EXT4_INLINE_DOTDOT_SIZE 4
23
24int ext4_get_inline_size(struct inode *inode)
25{
26 if (EXT4_I(inode)->i_inline_off)
27 return EXT4_I(inode)->i_inline_size;
28
29 return 0;
30}
31
32static int get_max_inline_xattr_value_size(struct inode *inode,
33 struct ext4_iloc *iloc)
34{
35 struct ext4_xattr_ibody_header *header;
36 struct ext4_xattr_entry *entry;
37 struct ext4_inode *raw_inode;
38 int free, min_offs;
39
40 min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
41 EXT4_GOOD_OLD_INODE_SIZE -
42 EXT4_I(inode)->i_extra_isize -
43 sizeof(struct ext4_xattr_ibody_header);
44
45 /*
46 * We need to subtract another sizeof(__u32) since an in-inode xattr
47 * needs an empty 4 bytes to indicate the gap between the xattr entry
48 * and the name/value pair.
49 */
50 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
51 return EXT4_XATTR_SIZE(min_offs -
52 EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
53 EXT4_XATTR_ROUND - sizeof(__u32));
54
55 raw_inode = ext4_raw_inode(iloc);
56 header = IHDR(inode, raw_inode);
57 entry = IFIRST(header);
58
59 /* Compute min_offs. */
60 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
61 if (!entry->e_value_block && entry->e_value_size) {
62 size_t offs = le16_to_cpu(entry->e_value_offs);
63 if (offs < min_offs)
64 min_offs = offs;
65 }
66 }
67 free = min_offs -
68 ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
69
70 if (EXT4_I(inode)->i_inline_off) {
71 entry = (struct ext4_xattr_entry *)
72 ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
73
74 free += le32_to_cpu(entry->e_value_size);
75 goto out;
76 }
77
78 free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
79
80 if (free > EXT4_XATTR_ROUND)
81 free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
82 else
83 free = 0;
84
85out:
86 return free;
87}
88
89/*
 90 * Get the maximum size we can now store in an inode.
 91 * If we can't find the space for an xattr entry, don't use the space
92 * of the extents since we have no space to indicate the inline data.
93 */
94int ext4_get_max_inline_size(struct inode *inode)
95{
96 int error, max_inline_size;
97 struct ext4_iloc iloc;
98
99 if (EXT4_I(inode)->i_extra_isize == 0)
100 return 0;
101
102 error = ext4_get_inode_loc(inode, &iloc);
103 if (error) {
104 ext4_error_inode(inode, __func__, __LINE__, 0,
105 "can't get inode location %lu",
106 inode->i_ino);
107 return 0;
108 }
109
110 down_read(&EXT4_I(inode)->xattr_sem);
111 max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
112 up_read(&EXT4_I(inode)->xattr_sem);
113
114 brelse(iloc.bh);
115
116 if (!max_inline_size)
117 return 0;
118
119 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
120}
121
122int ext4_has_inline_data(struct inode *inode)
123{
124 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
125 EXT4_I(inode)->i_inline_off;
126}
127
128/*
 129 * This function does not take xattr_sem, which is OK because it is
 130 * currently only used in a code path coming from ext4_iget, before
 131 * the new inode has been unlocked.
132 */
133int ext4_find_inline_data_nolock(struct inode *inode)
134{
135 struct ext4_xattr_ibody_find is = {
136 .s = { .not_found = -ENODATA, },
137 };
138 struct ext4_xattr_info i = {
139 .name_index = EXT4_XATTR_INDEX_SYSTEM,
140 .name = EXT4_XATTR_SYSTEM_DATA,
141 };
142 int error;
143
144 if (EXT4_I(inode)->i_extra_isize == 0)
145 return 0;
146
147 error = ext4_get_inode_loc(inode, &is.iloc);
148 if (error)
149 return error;
150
151 error = ext4_xattr_ibody_find(inode, &i, &is);
152 if (error)
153 goto out;
154
155 if (!is.s.not_found) {
156 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
157 (void *)ext4_raw_inode(&is.iloc));
158 EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
159 le32_to_cpu(is.s.here->e_value_size);
160 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
161 }
162out:
163 brelse(is.iloc.bh);
164 return error;
165}
166
167static int ext4_read_inline_data(struct inode *inode, void *buffer,
168 unsigned int len,
169 struct ext4_iloc *iloc)
170{
171 struct ext4_xattr_entry *entry;
172 struct ext4_xattr_ibody_header *header;
173 int cp_len = 0;
174 struct ext4_inode *raw_inode;
175
176 if (!len)
177 return 0;
178
179 BUG_ON(len > EXT4_I(inode)->i_inline_size);
180
181 cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
182 len : EXT4_MIN_INLINE_DATA_SIZE;
183
184 raw_inode = ext4_raw_inode(iloc);
185 memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
186
187 len -= cp_len;
188 buffer += cp_len;
189
190 if (!len)
191 goto out;
192
193 header = IHDR(inode, raw_inode);
194 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
195 EXT4_I(inode)->i_inline_off);
196 len = min_t(unsigned int, len,
197 (unsigned int)le32_to_cpu(entry->e_value_size));
198
199 memcpy(buffer,
200 (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
201 cp_len += len;
202
203out:
204 return cp_len;
205}
206
207/*
 208 * Write the buffer to the inline inode.
209 * If 'create' is set, we don't need to do the extra copy in the xattr
210 * value since it is already handled by ext4_xattr_ibody_inline_set.
211 * That saves us one memcpy.
212 */
213void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
214 void *buffer, loff_t pos, unsigned int len)
215{
216 struct ext4_xattr_entry *entry;
217 struct ext4_xattr_ibody_header *header;
218 struct ext4_inode *raw_inode;
219 int cp_len = 0;
220
221 BUG_ON(!EXT4_I(inode)->i_inline_off);
222 BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
223
224 raw_inode = ext4_raw_inode(iloc);
225 buffer += pos;
226
227 if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
228 cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
229 EXT4_MIN_INLINE_DATA_SIZE - pos : len;
230 memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
231
232 len -= cp_len;
233 buffer += cp_len;
234 pos += cp_len;
235 }
236
237 if (!len)
238 return;
239
240 pos -= EXT4_MIN_INLINE_DATA_SIZE;
241 header = IHDR(inode, raw_inode);
242 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
243 EXT4_I(inode)->i_inline_off);
244
245 memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
246 buffer, len);
247}
248
249static int ext4_create_inline_data(handle_t *handle,
250 struct inode *inode, unsigned len)
251{
252 int error;
253 void *value = NULL;
254 struct ext4_xattr_ibody_find is = {
255 .s = { .not_found = -ENODATA, },
256 };
257 struct ext4_xattr_info i = {
258 .name_index = EXT4_XATTR_INDEX_SYSTEM,
259 .name = EXT4_XATTR_SYSTEM_DATA,
260 };
261
262 error = ext4_get_inode_loc(inode, &is.iloc);
263 if (error)
264 return error;
265
266 error = ext4_journal_get_write_access(handle, is.iloc.bh);
267 if (error)
268 goto out;
269
270 if (len > EXT4_MIN_INLINE_DATA_SIZE) {
271 value = EXT4_ZERO_XATTR_VALUE;
272 len -= EXT4_MIN_INLINE_DATA_SIZE;
273 } else {
274 value = "";
275 len = 0;
276 }
277
278	/* Insert the xattr entry. */
279 i.value = value;
280 i.value_len = len;
281
282 error = ext4_xattr_ibody_find(inode, &i, &is);
283 if (error)
284 goto out;
285
286 BUG_ON(!is.s.not_found);
287
288 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
289 if (error) {
290 if (error == -ENOSPC)
291 ext4_clear_inode_state(inode,
292 EXT4_STATE_MAY_INLINE_DATA);
293 goto out;
294 }
295
296 memset((void *)ext4_raw_inode(&is.iloc)->i_block,
297 0, EXT4_MIN_INLINE_DATA_SIZE);
298
299 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
300 (void *)ext4_raw_inode(&is.iloc));
301 EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
302 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
303 ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
304 get_bh(is.iloc.bh);
305 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
306
307out:
308 brelse(is.iloc.bh);
309 return error;
310}
311
312static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
313 unsigned int len)
314{
315 int error;
316 void *value = NULL;
317 struct ext4_xattr_ibody_find is = {
318 .s = { .not_found = -ENODATA, },
319 };
320 struct ext4_xattr_info i = {
321 .name_index = EXT4_XATTR_INDEX_SYSTEM,
322 .name = EXT4_XATTR_SYSTEM_DATA,
323 };
324
325 /* If the old space is ok, write the data directly. */
326 if (len <= EXT4_I(inode)->i_inline_size)
327 return 0;
328
329 error = ext4_get_inode_loc(inode, &is.iloc);
330 if (error)
331 return error;
332
333 error = ext4_xattr_ibody_find(inode, &i, &is);
334 if (error)
335 goto out;
336
337 BUG_ON(is.s.not_found);
338
339 len -= EXT4_MIN_INLINE_DATA_SIZE;
340 value = kzalloc(len, GFP_NOFS);
341 if (!value)
342 goto out;
343
344 error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
345 value, len);
346 if (error == -ENODATA)
347 goto out;
348
349 error = ext4_journal_get_write_access(handle, is.iloc.bh);
350 if (error)
351 goto out;
352
353	/* Update the xattr entry. */
354 i.value = value;
355 i.value_len = len;
356
357 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
358 if (error)
359 goto out;
360
361 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
362 (void *)ext4_raw_inode(&is.iloc));
363 EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
364 le32_to_cpu(is.s.here->e_value_size);
365 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
366 get_bh(is.iloc.bh);
367 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
368
369out:
370 kfree(value);
371 brelse(is.iloc.bh);
372 return error;
373}
374
375int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
376 unsigned int len)
377{
378 int ret, size;
379 struct ext4_inode_info *ei = EXT4_I(inode);
380
381 if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
382 return -ENOSPC;
383
384 size = ext4_get_max_inline_size(inode);
385 if (size < len)
386 return -ENOSPC;
387
388 down_write(&EXT4_I(inode)->xattr_sem);
389
390 if (ei->i_inline_off)
391 ret = ext4_update_inline_data(handle, inode, len);
392 else
393 ret = ext4_create_inline_data(handle, inode, len);
394
395 up_write(&EXT4_I(inode)->xattr_sem);
396
397 return ret;
398}
399
400static int ext4_destroy_inline_data_nolock(handle_t *handle,
401 struct inode *inode)
402{
403 struct ext4_inode_info *ei = EXT4_I(inode);
404 struct ext4_xattr_ibody_find is = {
405 .s = { .not_found = 0, },
406 };
407 struct ext4_xattr_info i = {
408 .name_index = EXT4_XATTR_INDEX_SYSTEM,
409 .name = EXT4_XATTR_SYSTEM_DATA,
410 .value = NULL,
411 .value_len = 0,
412 };
413 int error;
414
415 if (!ei->i_inline_off)
416 return 0;
417
418 error = ext4_get_inode_loc(inode, &is.iloc);
419 if (error)
420 return error;
421
422 error = ext4_xattr_ibody_find(inode, &i, &is);
423 if (error)
424 goto out;
425
426 error = ext4_journal_get_write_access(handle, is.iloc.bh);
427 if (error)
428 goto out;
429
430 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
431 if (error)
432 goto out;
433
434 memset((void *)ext4_raw_inode(&is.iloc)->i_block,
435 0, EXT4_MIN_INLINE_DATA_SIZE);
436
437 if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
438 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
439 if (S_ISDIR(inode->i_mode) ||
440 S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
441 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
442 ext4_ext_tree_init(handle, inode);
443 }
444 }
445 ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
446
447 get_bh(is.iloc.bh);
448 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
449
450 EXT4_I(inode)->i_inline_off = 0;
451 EXT4_I(inode)->i_inline_size = 0;
452 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
453out:
454 brelse(is.iloc.bh);
455 if (error == -ENODATA)
456 error = 0;
457 return error;
458}
459
460static int ext4_read_inline_page(struct inode *inode, struct page *page)
461{
462 void *kaddr;
463 int ret = 0;
464 size_t len;
465 struct ext4_iloc iloc;
466
467 BUG_ON(!PageLocked(page));
468 BUG_ON(!ext4_has_inline_data(inode));
469 BUG_ON(page->index);
470
471 if (!EXT4_I(inode)->i_inline_off) {
472 ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
473 inode->i_ino);
474 goto out;
475 }
476
477 ret = ext4_get_inode_loc(inode, &iloc);
478 if (ret)
479 goto out;
480
481 len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
482 kaddr = kmap_atomic(page);
483 ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
484 flush_dcache_page(page);
485 kunmap_atomic(kaddr);
486 zero_user_segment(page, len, PAGE_CACHE_SIZE);
487 SetPageUptodate(page);
488 brelse(iloc.bh);
489
490out:
491 return ret;
492}
493
494int ext4_readpage_inline(struct inode *inode, struct page *page)
495{
496 int ret = 0;
497
498 down_read(&EXT4_I(inode)->xattr_sem);
499 if (!ext4_has_inline_data(inode)) {
500 up_read(&EXT4_I(inode)->xattr_sem);
501 return -EAGAIN;
502 }
503
504 /*
505 * Current inline data can only exist in the first page,
506 * so for all the other pages, just set them uptodate.
507 */
508 if (!page->index)
509 ret = ext4_read_inline_page(inode, page);
510 else if (!PageUptodate(page)) {
511 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
512 SetPageUptodate(page);
513 }
514
515 up_read(&EXT4_I(inode)->xattr_sem);
516
517 unlock_page(page);
518 return ret >= 0 ? 0 : ret;
519}
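
/*
 * [Editorial sketch -- not part of the kernel source.] Note how the
 * function above re-tests ext4_has_inline_data() after taking xattr_sem:
 * a racing writer may convert the inode to extents between the caller's
 * optimistic check and the lock. A userspace model of that
 * check-lock-recheck idiom, with a pthread rwlock standing in for the
 * kernel rw_semaphore; names are illustrative.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t model_xattr_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool model_has_inline;		/* models ext4_has_inline_data() */

static int read_inline_locked(void)
{
	pthread_rwlock_rdlock(&model_xattr_sem);
	if (!model_has_inline) {
		/* Converted under us: drop the lock and tell the caller
		 * to fall back to the normal (extent) path. */
		pthread_rwlock_unlock(&model_xattr_sem);
		return -1;
	}
	/* ... the inline area is stable while the read lock is held ... */
	pthread_rwlock_unlock(&model_xattr_sem);
	return 0;
}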
520
521static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
522 struct inode *inode,
523 unsigned flags)
524{
525 int ret, needed_blocks;
526 handle_t *handle = NULL;
527 int retries = 0, sem_held = 0;
528 struct page *page = NULL;
529 unsigned from, to;
530 struct ext4_iloc iloc;
531
532 if (!ext4_has_inline_data(inode)) {
533 /*
534 * clear the flag so that no new write
535 * will trap here again.
536 */
537 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
538 return 0;
539 }
540
541 needed_blocks = ext4_writepage_trans_blocks(inode);
542
543 ret = ext4_get_inode_loc(inode, &iloc);
544 if (ret)
545 return ret;
546
547retry:
548 handle = ext4_journal_start(inode, needed_blocks);
549 if (IS_ERR(handle)) {
550 ret = PTR_ERR(handle);
551 handle = NULL;
552 goto out;
553 }
554
555 /* We cannot recurse into the filesystem as the transaction is already
556 * started. */
557 flags |= AOP_FLAG_NOFS;
558
559 page = grab_cache_page_write_begin(mapping, 0, flags);
560 if (!page) {
561 ret = -ENOMEM;
562 goto out;
563 }
564
565 down_write(&EXT4_I(inode)->xattr_sem);
566 sem_held = 1;
567	/* If someone has already done this for us, just exit. */
568 if (!ext4_has_inline_data(inode)) {
569 ret = 0;
570 goto out;
571 }
572
573 from = 0;
574 to = ext4_get_inline_size(inode);
575 if (!PageUptodate(page)) {
576 ret = ext4_read_inline_page(inode, page);
577 if (ret < 0)
578 goto out;
579 }
580
581 ret = ext4_destroy_inline_data_nolock(handle, inode);
582 if (ret)
583 goto out;
584
585 if (ext4_should_dioread_nolock(inode))
586 ret = __block_write_begin(page, from, to, ext4_get_block_write);
587 else
588 ret = __block_write_begin(page, from, to, ext4_get_block);
589
590 if (!ret && ext4_should_journal_data(inode)) {
591 ret = ext4_walk_page_buffers(handle, page_buffers(page),
592 from, to, NULL,
593 do_journal_get_write_access);
594 }
595
596 if (ret) {
597 unlock_page(page);
598 page_cache_release(page);
599 ext4_orphan_add(handle, inode);
600 up_write(&EXT4_I(inode)->xattr_sem);
601 sem_held = 0;
602 ext4_journal_stop(handle);
603 handle = NULL;
604 ext4_truncate_failed_write(inode);
605 /*
606 * If truncate failed early the inode might
607 * still be on the orphan list; we need to
608 * make sure the inode is removed from the
609 * orphan list in that case.
610 */
611 if (inode->i_nlink)
612 ext4_orphan_del(NULL, inode);
613 }
614
615 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
616 goto retry;
617
618 block_commit_write(page, from, to);
619out:
620 if (page) {
621 unlock_page(page);
622 page_cache_release(page);
623 }
624 if (sem_held)
625 up_write(&EXT4_I(inode)->xattr_sem);
626 if (handle)
627 ext4_journal_stop(handle);
628 brelse(iloc.bh);
629 return ret;
630}
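
/*
 * [Editorial sketch -- not part of the kernel source.] The retry label
 * above is the usual ext4 ENOSPC idiom: tear the transaction down and
 * retry the whole attempt, bounded by ext4_should_retry_alloc(), which
 * waits for a journal commit that may free space. A stripped-down model;
 * the retry budget and callback signature are illustrative.
 */
#include <errno.h>

static int with_enospc_retry(int (*attempt)(void *), void *arg)
{
	int retries = 3;	/* illustrative bound */
	int ret;

	do {
		/* start handle, do the work, stop handle */
		ret = attempt(arg);
	} while (ret == -ENOSPC && retries-- > 0);
	return ret;
}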
631
632/*
633 * Try to write data in the inode.
634 * If the inode has inline data, check whether the new write can be
635 * in the inode also. If not, create the page and the handle, move the
636 * data to the page, make it uptodate, and let later code create an extent for it.
637 */
638int ext4_try_to_write_inline_data(struct address_space *mapping,
639 struct inode *inode,
640 loff_t pos, unsigned len,
641 unsigned flags,
642 struct page **pagep)
643{
644 int ret;
645 handle_t *handle;
646 struct page *page;
647 struct ext4_iloc iloc;
648
649 if (pos + len > ext4_get_max_inline_size(inode))
650 goto convert;
651
652 ret = ext4_get_inode_loc(inode, &iloc);
653 if (ret)
654 return ret;
655
656 /*
657	 * The write may fit inside the inode, so try to
658	 * reserve the in-inode space first.
659 */
660 handle = ext4_journal_start(inode, 1);
661 if (IS_ERR(handle)) {
662 ret = PTR_ERR(handle);
663 handle = NULL;
664 goto out;
665 }
666
667 ret = ext4_prepare_inline_data(handle, inode, pos + len);
668 if (ret && ret != -ENOSPC)
669 goto out;
670
671	/* We don't have space in the inline inode, so convert it to extents. */
672 if (ret == -ENOSPC) {
673 ext4_journal_stop(handle);
674 brelse(iloc.bh);
675 goto convert;
676 }
677
678 flags |= AOP_FLAG_NOFS;
679
680 page = grab_cache_page_write_begin(mapping, 0, flags);
681 if (!page) {
682 ret = -ENOMEM;
683 goto out;
684 }
685
686 *pagep = page;
687 down_read(&EXT4_I(inode)->xattr_sem);
688 if (!ext4_has_inline_data(inode)) {
689 ret = 0;
690 unlock_page(page);
691 page_cache_release(page);
692 goto out_up_read;
693 }
694
695 if (!PageUptodate(page)) {
696 ret = ext4_read_inline_page(inode, page);
697 if (ret < 0)
698 goto out_up_read;
699 }
700
701 ret = 1;
702 handle = NULL;
703out_up_read:
704 up_read(&EXT4_I(inode)->xattr_sem);
705out:
706 if (handle)
707 ext4_journal_stop(handle);
708 brelse(iloc.bh);
709 return ret;
710convert:
711 return ext4_convert_inline_data_to_extent(mapping,
712 inode, flags);
713}
714
715int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
716 unsigned copied, struct page *page)
717{
718 int ret;
719 void *kaddr;
720 struct ext4_iloc iloc;
721
722 if (unlikely(copied < len)) {
723 if (!PageUptodate(page)) {
724 copied = 0;
725 goto out;
726 }
727 }
728
729 ret = ext4_get_inode_loc(inode, &iloc);
730 if (ret) {
731 ext4_std_error(inode->i_sb, ret);
732 copied = 0;
733 goto out;
734 }
735
736 down_write(&EXT4_I(inode)->xattr_sem);
737 BUG_ON(!ext4_has_inline_data(inode));
738
739 kaddr = kmap_atomic(page);
740 ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
741 kunmap_atomic(kaddr);
742 SetPageUptodate(page);
743	/* Clear the page dirty bit so that writepages won't try to write it out. */
744 ClearPageDirty(page);
745
746 up_write(&EXT4_I(inode)->xattr_sem);
747 brelse(iloc.bh);
748out:
749 return copied;
750}
751
752struct buffer_head *
753ext4_journalled_write_inline_data(struct inode *inode,
754 unsigned len,
755 struct page *page)
756{
757 int ret;
758 void *kaddr;
759 struct ext4_iloc iloc;
760
761 ret = ext4_get_inode_loc(inode, &iloc);
762 if (ret) {
763 ext4_std_error(inode->i_sb, ret);
764 return NULL;
765 }
766
767 down_write(&EXT4_I(inode)->xattr_sem);
768 kaddr = kmap_atomic(page);
769 ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
770 kunmap_atomic(kaddr);
771 up_write(&EXT4_I(inode)->xattr_sem);
772
773 return iloc.bh;
774}
775
776/*
777 * Try to make the page cache and handle ready for the inline data case.
778 * We can call this function in 2 cases:
779 * 1. The inode is created and the first write exceeds inline size. We can
780 * clear the inode state safely.
781 * 2. The inode has inline data, then we need to read the data, make it
782 * uptodate and dirty so that ext4_da_writepages can handle it. We don't
783 * need to start the journal since the file's metadata isn't changed now.
784 */
785static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
786 struct inode *inode,
787 unsigned flags,
788 void **fsdata)
789{
790 int ret = 0, inline_size;
791 struct page *page;
792
793 page = grab_cache_page_write_begin(mapping, 0, flags);
794 if (!page)
795 return -ENOMEM;
796
797 down_read(&EXT4_I(inode)->xattr_sem);
798 if (!ext4_has_inline_data(inode)) {
799 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
800 goto out;
801 }
802
803 inline_size = ext4_get_inline_size(inode);
804
805 if (!PageUptodate(page)) {
806 ret = ext4_read_inline_page(inode, page);
807 if (ret < 0)
808 goto out;
809 }
810
811 ret = __block_write_begin(page, 0, inline_size,
812 ext4_da_get_block_prep);
813 if (ret) {
814 ext4_truncate_failed_write(inode);
815 goto out;
816 }
817
818 SetPageDirty(page);
819 SetPageUptodate(page);
820 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
821 *fsdata = (void *)CONVERT_INLINE_DATA;
822
823out:
824 up_read(&EXT4_I(inode)->xattr_sem);
825 if (page) {
826 unlock_page(page);
827 page_cache_release(page);
828 }
829 return ret;
830}
831
832/*
833 * Prepare the write for the inline data.
834 * If the data can be written into the inode, we just read
835 * the page, make it uptodate, and start the journal.
836 * Otherwise read the page, make it dirty so that it can be
837 * handled in writepages (the i_disksize update is left to the
838 * normal ext4_da_write_end).
839 */
840int ext4_da_write_inline_data_begin(struct address_space *mapping,
841 struct inode *inode,
842 loff_t pos, unsigned len,
843 unsigned flags,
844 struct page **pagep,
845 void **fsdata)
846{
847 int ret, inline_size;
848 handle_t *handle;
849 struct page *page;
850 struct ext4_iloc iloc;
851
852 ret = ext4_get_inode_loc(inode, &iloc);
853 if (ret)
854 return ret;
855
856 handle = ext4_journal_start(inode, 1);
857 if (IS_ERR(handle)) {
858 ret = PTR_ERR(handle);
859 handle = NULL;
860 goto out;
861 }
862
863 inline_size = ext4_get_max_inline_size(inode);
864
865 ret = -ENOSPC;
866 if (inline_size >= pos + len) {
867 ret = ext4_prepare_inline_data(handle, inode, pos + len);
868 if (ret && ret != -ENOSPC)
869 goto out;
870 }
871
872 if (ret == -ENOSPC) {
873 ret = ext4_da_convert_inline_data_to_extent(mapping,
874 inode,
875 flags,
876 fsdata);
877 goto out;
878 }
879
880 /*
881 * We cannot recurse into the filesystem as the transaction
882 * is already started.
883 */
884 flags |= AOP_FLAG_NOFS;
885
886 page = grab_cache_page_write_begin(mapping, 0, flags);
887 if (!page) {
888 ret = -ENOMEM;
889 goto out;
890 }
891
892 down_read(&EXT4_I(inode)->xattr_sem);
893 if (!ext4_has_inline_data(inode)) {
894 ret = 0;
895 goto out_release_page;
896 }
897
898 if (!PageUptodate(page)) {
899 ret = ext4_read_inline_page(inode, page);
900 if (ret < 0)
901 goto out_release_page;
902 }
903
904 up_read(&EXT4_I(inode)->xattr_sem);
905 *pagep = page;
906 handle = NULL;
907 brelse(iloc.bh);
908 return 1;
909out_release_page:
910 up_read(&EXT4_I(inode)->xattr_sem);
911 unlock_page(page);
912 page_cache_release(page);
913out:
914 if (handle)
915 ext4_journal_stop(handle);
916 brelse(iloc.bh);
917 return ret;
918}
919
920int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
921 unsigned len, unsigned copied,
922 struct page *page)
923{
924 int i_size_changed = 0;
925
926 copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
927
928 /*
929 * No need to use i_size_read() here, the i_size
930 * cannot change under us because we hold i_mutex.
931 *
932 * But it's important to update i_size while still holding page lock:
933 * page writeout could otherwise come in and zero beyond i_size.
934 */
935 if (pos+copied > inode->i_size) {
936 i_size_write(inode, pos+copied);
937 i_size_changed = 1;
938 }
939 unlock_page(page);
940 page_cache_release(page);
941
942 /*
943 * Don't mark the inode dirty under page lock. First, it unnecessarily
944 * makes the holding time of page lock longer. Second, it forces lock
945 * ordering of page lock and transaction start for journaling
946 * filesystems.
947 */
948 if (i_size_changed)
949 mark_inode_dirty(inode);
950
951 return copied;
952}
953
954#ifdef INLINE_DIR_DEBUG
955void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
956 void *inline_start, int inline_size)
957{
958 int offset;
959 unsigned short de_len;
960 struct ext4_dir_entry_2 *de = inline_start;
961 void *dlimit = inline_start + inline_size;
962
963 trace_printk("inode %lu\n", dir->i_ino);
964 offset = 0;
965 while ((void *)de < dlimit) {
966 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
967 trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
968 offset, de_len, de->name_len, de->name,
969 de->name_len, le32_to_cpu(de->inode));
970 if (ext4_check_dir_entry(dir, NULL, de, bh,
971 inline_start, inline_size, offset))
972 BUG();
973
974 offset += de_len;
975 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
976 }
977}
978#else
979#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
980#endif
981
982/*
983 * Add a new entry into an inline dir.
984 * It will return -ENOSPC if no space is available, -EIO on
985 * corruption, and -EEXIST if the directory entry already exists.
986 */
987static int ext4_add_dirent_to_inline(handle_t *handle,
988 struct dentry *dentry,
989 struct inode *inode,
990 struct ext4_iloc *iloc,
991 void *inline_start, int inline_size)
992{
993 struct inode *dir = dentry->d_parent->d_inode;
994 const char *name = dentry->d_name.name;
995 int namelen = dentry->d_name.len;
996 unsigned short reclen;
997 int err;
998 struct ext4_dir_entry_2 *de;
999
1000 reclen = EXT4_DIR_REC_LEN(namelen);
1001 err = ext4_find_dest_de(dir, inode, iloc->bh,
1002 inline_start, inline_size,
1003 name, namelen, &de);
1004 if (err)
1005 return err;
1006
1007 err = ext4_journal_get_write_access(handle, iloc->bh);
1008 if (err)
1009 return err;
1010 ext4_insert_dentry(inode, de, inline_size, name, namelen);
1011
1012 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
1013
1014 /*
1015 * XXX shouldn't update any times until successful
1016 * completion of syscall, but too many callers depend
1017 * on this.
1018 *
1019 * XXX similarly, too many callers depend on
1020 * ext4_new_inode() setting the times, but error
1021 * recovery deletes the inode, so the worst that can
1022 * happen is that the times are slightly out of date
1023 * and/or different from the directory change time.
1024 */
1025 dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1026 ext4_update_dx_flag(dir);
1027 dir->i_version++;
1028 ext4_mark_inode_dirty(handle, dir);
1029 return 1;
1030}
1031
1032static void *ext4_get_inline_xattr_pos(struct inode *inode,
1033 struct ext4_iloc *iloc)
1034{
1035 struct ext4_xattr_entry *entry;
1036 struct ext4_xattr_ibody_header *header;
1037
1038 BUG_ON(!EXT4_I(inode)->i_inline_off);
1039
1040 header = IHDR(inode, ext4_raw_inode(iloc));
1041 entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
1042 EXT4_I(inode)->i_inline_off);
1043
1044 return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
1045}
1046
1047/* Set the final de to cover the whole block. */
1048static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
1049{
1050 struct ext4_dir_entry_2 *de, *prev_de;
1051 void *limit;
1052 int de_len;
1053
1054 de = (struct ext4_dir_entry_2 *)de_buf;
1055 if (old_size) {
1056 limit = de_buf + old_size;
1057 do {
1058 prev_de = de;
1059 de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
1060 de_buf += de_len;
1061 de = (struct ext4_dir_entry_2 *)de_buf;
1062 } while (de_buf < limit);
1063
1064 prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
1065 old_size, new_size);
1066 } else {
1067		/* this area was just created, so create an empty entry covering it. */
1068 de->inode = 0;
1069 de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
1070 }
1071}
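
/*
 * [Editorial sketch -- not part of the kernel source.] The helper above
 * relies on the classic ext2/3/4 directory invariant: records chain via
 * rec_len, and the last record's rec_len always reaches the end of the
 * area, so growing the area only means stretching the final record. A
 * simplified model of the old_size > 0 branch; it omits name storage and
 * the ext4_rec_len_to_disk() encoding used for large blocks.
 */
#include <stdint.h>

struct dirent2_sketch {		/* stand-in for struct ext4_dir_entry_2 */
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	/* name bytes follow in the real layout */
};

static void grow_final_de(void *buf, int old_size, int new_size)
{
	char *p = buf, *limit = (char *)buf + old_size;
	struct dirent2_sketch *de = buf, *prev = de;

	if (old_size == 0)	/* the kernel creates an empty entry here */
		return;

	while (p < limit) {	/* assumes well-formed, nonzero rec_lens */
		prev = de;
		p += de->rec_len;
		de = (struct dirent2_sketch *)p;
	}
	prev->rec_len += new_size - old_size;	/* absorb the new space */
}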
1072
1073static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
1074 struct ext4_iloc *iloc)
1075{
1076 int ret;
1077 int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
1078 int new_size = get_max_inline_xattr_value_size(dir, iloc);
1079
1080 if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
1081 return -ENOSPC;
1082
1083 ret = ext4_update_inline_data(handle, dir,
1084 new_size + EXT4_MIN_INLINE_DATA_SIZE);
1085 if (ret)
1086 return ret;
1087
1088 ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
1089 EXT4_I(dir)->i_inline_size -
1090 EXT4_MIN_INLINE_DATA_SIZE);
1091 dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
1092 return 0;
1093}
1094
1095static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
1096 struct ext4_iloc *iloc,
1097 void *buf, int inline_size)
1098{
1099 ext4_create_inline_data(handle, inode, inline_size);
1100 ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
1101 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1102}
1103
1104static int ext4_finish_convert_inline_dir(handle_t *handle,
1105 struct inode *inode,
1106 struct buffer_head *dir_block,
1107 void *buf,
1108 int inline_size)
1109{
1110 int err, csum_size = 0, header_size = 0;
1111 struct ext4_dir_entry_2 *de;
1112 struct ext4_dir_entry_tail *t;
1113 void *target = dir_block->b_data;
1114
1115 /*
1116 * First create "." and ".." and then copy the dir information
1117 * back to the block.
1118 */
1119 de = (struct ext4_dir_entry_2 *)target;
1120 de = ext4_init_dot_dotdot(inode, de,
1121 inode->i_sb->s_blocksize, csum_size,
1122 le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
1123 header_size = (void *)de - target;
1124
1125 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
1126 inline_size - EXT4_INLINE_DOTDOT_SIZE);
1127
1128 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1129 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1130 csum_size = sizeof(struct ext4_dir_entry_tail);
1131
1132 inode->i_size = inode->i_sb->s_blocksize;
1133 i_size_write(inode, inode->i_sb->s_blocksize);
1134 EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1135 ext4_update_final_de(dir_block->b_data,
1136 inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
1137 inode->i_sb->s_blocksize - csum_size);
1138
1139 if (csum_size) {
1140 t = EXT4_DIRENT_TAIL(dir_block->b_data,
1141 inode->i_sb->s_blocksize);
1142 initialize_dirent_tail(t, inode->i_sb->s_blocksize);
1143 }
1144 set_buffer_uptodate(dir_block);
1145 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
1146 if (err)
1147 goto out;
1148 set_buffer_verified(dir_block);
1149out:
1150 return err;
1151}
1152
1153static int ext4_convert_inline_data_nolock(handle_t *handle,
1154 struct inode *inode,
1155 struct ext4_iloc *iloc)
1156{
1157 int error;
1158 void *buf = NULL;
1159 struct buffer_head *data_bh = NULL;
1160 struct ext4_map_blocks map;
1161 int inline_size;
1162
1163 inline_size = ext4_get_inline_size(inode);
1164 buf = kmalloc(inline_size, GFP_NOFS);
1165 if (!buf) {
1166 error = -ENOMEM;
1167 goto out;
1168 }
1169
1170 error = ext4_read_inline_data(inode, buf, inline_size, iloc);
1171 if (error < 0)
1172 goto out;
1173
1174 error = ext4_destroy_inline_data_nolock(handle, inode);
1175 if (error)
1176 goto out;
1177
1178 map.m_lblk = 0;
1179 map.m_len = 1;
1180 map.m_flags = 0;
1181 error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
1182 if (error < 0)
1183 goto out_restore;
1184 if (!(map.m_flags & EXT4_MAP_MAPPED)) {
1185 error = -EIO;
1186 goto out_restore;
1187 }
1188
1189 data_bh = sb_getblk(inode->i_sb, map.m_pblk);
1190 if (!data_bh) {
1191 error = -EIO;
1192 goto out_restore;
1193 }
1194
1195 lock_buffer(data_bh);
1196 error = ext4_journal_get_create_access(handle, data_bh);
1197 if (error) {
1198 unlock_buffer(data_bh);
1199 error = -EIO;
1200 goto out_restore;
1201 }
1202 memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
1203
1204 if (!S_ISDIR(inode->i_mode)) {
1205 memcpy(data_bh->b_data, buf, inline_size);
1206 set_buffer_uptodate(data_bh);
1207 error = ext4_handle_dirty_metadata(handle,
1208 inode, data_bh);
1209 } else {
1210 error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
1211 buf, inline_size);
1212 }
1213
1214 unlock_buffer(data_bh);
1215out_restore:
1216 if (error)
1217 ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
1218
1219out:
1220 brelse(data_bh);
1221 kfree(buf);
1222 return error;
1223}
1224
1225/*
1226 * Try to add the new entry to the inline data.
1227 * If it succeeds, return 0. If not, extend the inline dir and copy the
1228 * data to a newly created block.
1229 */
1230int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
1231 struct inode *inode)
1232{
1233 int ret, inline_size;
1234 void *inline_start;
1235 struct ext4_iloc iloc;
1236 struct inode *dir = dentry->d_parent->d_inode;
1237
1238 ret = ext4_get_inode_loc(dir, &iloc);
1239 if (ret)
1240 return ret;
1241
1242 down_write(&EXT4_I(dir)->xattr_sem);
1243 if (!ext4_has_inline_data(dir))
1244 goto out;
1245
1246 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1247 EXT4_INLINE_DOTDOT_SIZE;
1248 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
1249
1250 ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
1251 inline_start, inline_size);
1252 if (ret != -ENOSPC)
1253 goto out;
1254
1255	/* check whether it can be inserted into the inline xattr space. */
1256 inline_size = EXT4_I(dir)->i_inline_size -
1257 EXT4_MIN_INLINE_DATA_SIZE;
1258 if (!inline_size) {
1259		/* Try to use the xattr space. */
1260 ret = ext4_update_inline_dir(handle, dir, &iloc);
1261 if (ret && ret != -ENOSPC)
1262 goto out;
1263
1264 inline_size = EXT4_I(dir)->i_inline_size -
1265 EXT4_MIN_INLINE_DATA_SIZE;
1266 }
1267
1268 if (inline_size) {
1269 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1270
1271 ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
1272 inline_start, inline_size);
1273
1274 if (ret != -ENOSPC)
1275 goto out;
1276 }
1277
1278 /*
1279 * The inline space is filled up, so create a new block for it.
1280 * As the extent tree will be created, we have to save the inline
1281 * dir first.
1282 */
1283 ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
1284
1285out:
1286 ext4_mark_inode_dirty(handle, dir);
1287 up_write(&EXT4_I(dir)->xattr_sem);
1288 brelse(iloc.bh);
1289 return ret;
1290}
1291
1292int ext4_read_inline_dir(struct file *filp,
1293 void *dirent, filldir_t filldir,
1294 int *has_inline_data)
1295{
1296 int error = 0;
1297 unsigned int offset, parent_ino;
1298 int i, stored;
1299 struct ext4_dir_entry_2 *de;
1300 struct super_block *sb;
1301 struct inode *inode = filp->f_path.dentry->d_inode;
1302 int ret, inline_size = 0;
1303 struct ext4_iloc iloc;
1304 void *dir_buf = NULL;
1305
1306 ret = ext4_get_inode_loc(inode, &iloc);
1307 if (ret)
1308 return ret;
1309
1310 down_read(&EXT4_I(inode)->xattr_sem);
1311 if (!ext4_has_inline_data(inode)) {
1312 up_read(&EXT4_I(inode)->xattr_sem);
1313 *has_inline_data = 0;
1314 goto out;
1315 }
1316
1317 inline_size = ext4_get_inline_size(inode);
1318 dir_buf = kmalloc(inline_size, GFP_NOFS);
1319 if (!dir_buf) {
1320 ret = -ENOMEM;
1321 up_read(&EXT4_I(inode)->xattr_sem);
1322 goto out;
1323 }
1324
1325 ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
1326 up_read(&EXT4_I(inode)->xattr_sem);
1327 if (ret < 0)
1328 goto out;
1329
1330 sb = inode->i_sb;
1331 stored = 0;
1332 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1333
1334 while (!error && !stored && filp->f_pos < inode->i_size) {
1335revalidate:
1336 /*
1337 * If the version has changed since the last call to
1338 * readdir(2), then we might be pointing to an invalid
1339 * dirent right now. Scan from the start of the inline
1340 * dir to make sure.
1341 */
1342 if (filp->f_version != inode->i_version) {
1343 for (i = 0;
1344 i < inode->i_size && i < offset;) {
1345 if (!i) {
1346 /* skip "." and ".." if needed. */
1347 i += EXT4_INLINE_DOTDOT_SIZE;
1348 continue;
1349 }
1350 de = (struct ext4_dir_entry_2 *)
1351 (dir_buf + i);
1352 /* It's too expensive to do a full
1353 * dirent test each time round this
1354 * loop, but we do have to test at
1355 * least that it is non-zero. A
1356 * failure will be detected in the
1357 * dirent test below. */
1358 if (ext4_rec_len_from_disk(de->rec_len,
1359 inline_size) < EXT4_DIR_REC_LEN(1))
1360 break;
1361 i += ext4_rec_len_from_disk(de->rec_len,
1362 inline_size);
1363 }
1364 offset = i;
1365 filp->f_pos = offset;
1366 filp->f_version = inode->i_version;
1367 }
1368
1369 while (!error && filp->f_pos < inode->i_size) {
1370 if (filp->f_pos == 0) {
1371 error = filldir(dirent, ".", 1, 0, inode->i_ino,
1372 DT_DIR);
1373 if (error)
1374 break;
1375 stored++;
1376
1377 error = filldir(dirent, "..", 2, 0, parent_ino,
1378 DT_DIR);
1379 if (error)
1380 break;
1381 stored++;
1382
1383 filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
1384 continue;
1385 }
1386
1387 de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
1388 if (ext4_check_dir_entry(inode, filp, de,
1389 iloc.bh, dir_buf,
1390 inline_size, offset)) {
1391 ret = stored;
1392 goto out;
1393 }
1394 offset += ext4_rec_len_from_disk(de->rec_len,
1395 inline_size);
1396 if (le32_to_cpu(de->inode)) {
1397 /* We might block in the next section
1398 * if the data destination is
1399 * currently swapped out. So, use a
1400 * version stamp to detect whether or
1401 * not the directory has been modified
1402 * during the copy operation.
1403 */
1404 u64 version = filp->f_version;
1405
1406 error = filldir(dirent, de->name,
1407 de->name_len,
1408 filp->f_pos,
1409 le32_to_cpu(de->inode),
1410 get_dtype(sb, de->file_type));
1411 if (error)
1412 break;
1413 if (version != filp->f_version)
1414 goto revalidate;
1415 stored++;
1416 }
1417 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
1418 inline_size);
1419 }
1420 offset = 0;
1421 }
1422out:
1423 kfree(dir_buf);
1424 brelse(iloc.bh);
1425 return ret;
1426}
1427
1428struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
1429 struct ext4_dir_entry_2 **parent_de,
1430 int *retval)
1431{
1432 struct ext4_iloc iloc;
1433
1434 *retval = ext4_get_inode_loc(inode, &iloc);
1435 if (*retval)
1436 return NULL;
1437
1438 *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1439
1440 return iloc.bh;
1441}
1442
1443/*
1444 * Try to create the inline data for the new dir.
1445 * If it succeeds, return 0, otherwise return the error.
1446 * In case of ENOSPC, the caller should create the normal disk layout dir.
1447 */
1448int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
1449 struct inode *inode)
1450{
1451 int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
1452 struct ext4_iloc iloc;
1453 struct ext4_dir_entry_2 *de;
1454
1455 ret = ext4_get_inode_loc(inode, &iloc);
1456 if (ret)
1457 return ret;
1458
1459 ret = ext4_prepare_inline_data(handle, inode, inline_size);
1460 if (ret)
1461 goto out;
1462
1463 /*
1464 * For inline dir, we only save the inode information for the ".."
1465 * and create a fake dentry to cover the remaining space.
1466 */
1467 de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1468 de->inode = cpu_to_le32(parent->i_ino);
1469 de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
1470 de->inode = 0;
1471 de->rec_len = ext4_rec_len_to_disk(
1472 inline_size - EXT4_INLINE_DOTDOT_SIZE,
1473 inline_size);
1474 set_nlink(inode, 2);
1475 inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
1476out:
1477 brelse(iloc.bh);
1478 return ret;
1479}
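
/*
 * [Editorial sketch -- not part of the kernel source.] The layout built
 * above is 4 bytes of ".." inode number (EXT4_INLINE_DOTDOT_SIZE; "."
 * and ".." need no full dentries in the inline format) followed by one
 * empty dentry covering the remaining 56 bytes. A userspace model; the
 * on-disk fields are little-endian, which this sketch glosses over.
 */
#include <stdint.h>
#include <string.h>

#define MIN_INLINE  60	/* models EXT4_MIN_INLINE_DATA_SIZE */
#define DOTDOT_SIZE 4	/* models EXT4_INLINE_DOTDOT_SIZE */

struct dirent2_model {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
};

static void init_inline_dir(uint8_t *area, uint32_t parent_ino)
{
	struct dirent2_model de;

	memset(area, 0, MIN_INLINE);
	memcpy(area, &parent_ino, sizeof(parent_ino));	/* the ".." inode */

	memset(&de, 0, sizeof(de));
	de.inode = 0;					/* fake, unused entry */
	de.rec_len = MIN_INLINE - DOTDOT_SIZE;		/* covers the rest */
	memcpy(area + DOTDOT_SIZE, &de, sizeof(de));
}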
1480
1481struct buffer_head *ext4_find_inline_entry(struct inode *dir,
1482 const struct qstr *d_name,
1483 struct ext4_dir_entry_2 **res_dir,
1484 int *has_inline_data)
1485{
1486 int ret;
1487 struct ext4_iloc iloc;
1488 void *inline_start;
1489 int inline_size;
1490
1491 if (ext4_get_inode_loc(dir, &iloc))
1492 return NULL;
1493
1494 down_read(&EXT4_I(dir)->xattr_sem);
1495 if (!ext4_has_inline_data(dir)) {
1496 *has_inline_data = 0;
1497 goto out;
1498 }
1499
1500 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1501 EXT4_INLINE_DOTDOT_SIZE;
1502 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
1503 ret = search_dir(iloc.bh, inline_start, inline_size,
1504 dir, d_name, 0, res_dir);
1505 if (ret == 1)
1506 goto out_find;
1507 if (ret < 0)
1508 goto out;
1509
1510 if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
1511 goto out;
1512
1513 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1514 inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
1515
1516 ret = search_dir(iloc.bh, inline_start, inline_size,
1517 dir, d_name, 0, res_dir);
1518 if (ret == 1)
1519 goto out_find;
1520
1521out:
1522 brelse(iloc.bh);
1523 iloc.bh = NULL;
1524out_find:
1525 up_read(&EXT4_I(dir)->xattr_sem);
1526 return iloc.bh;
1527}
1528
1529int ext4_delete_inline_entry(handle_t *handle,
1530 struct inode *dir,
1531 struct ext4_dir_entry_2 *de_del,
1532 struct buffer_head *bh,
1533 int *has_inline_data)
1534{
1535 int err, inline_size;
1536 struct ext4_iloc iloc;
1537 void *inline_start;
1538
1539 err = ext4_get_inode_loc(dir, &iloc);
1540 if (err)
1541 return err;
1542
1543 down_write(&EXT4_I(dir)->xattr_sem);
1544 if (!ext4_has_inline_data(dir)) {
1545 *has_inline_data = 0;
1546 goto out;
1547 }
1548
1549 if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
1550 EXT4_MIN_INLINE_DATA_SIZE) {
1551 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1552 EXT4_INLINE_DOTDOT_SIZE;
1553 inline_size = EXT4_MIN_INLINE_DATA_SIZE -
1554 EXT4_INLINE_DOTDOT_SIZE;
1555 } else {
1556 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1557 inline_size = ext4_get_inline_size(dir) -
1558 EXT4_MIN_INLINE_DATA_SIZE;
1559 }
1560
1561 err = ext4_journal_get_write_access(handle, bh);
1562 if (err)
1563 goto out;
1564
1565 err = ext4_generic_delete_entry(handle, dir, de_del, bh,
1566 inline_start, inline_size, 0);
1567 if (err)
1568 goto out;
1569
1570 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1571 err = ext4_mark_inode_dirty(handle, dir);
1572 if (unlikely(err))
1573 goto out;
1574
1575 ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
1576out:
1577 up_write(&EXT4_I(dir)->xattr_sem);
1578 brelse(iloc.bh);
1579 if (err != -ENOENT)
1580 ext4_std_error(dir->i_sb, err);
1581 return err;
1582}
1583
1584/*
1585 * Get the inline dentry at offset.
1586 */
1587static inline struct ext4_dir_entry_2 *
1588ext4_get_inline_entry(struct inode *inode,
1589 struct ext4_iloc *iloc,
1590 unsigned int offset,
1591 void **inline_start,
1592 int *inline_size)
1593{
1594 void *inline_pos;
1595
1596 BUG_ON(offset > ext4_get_inline_size(inode));
1597
1598 if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
1599 inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
1600 *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
1601 } else {
1602 inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
1603 offset -= EXT4_MIN_INLINE_DATA_SIZE;
1604 *inline_size = ext4_get_inline_size(inode) -
1605 EXT4_MIN_INLINE_DATA_SIZE;
1606 }
1607
1608 if (inline_start)
1609 *inline_start = inline_pos;
1610 return (struct ext4_dir_entry_2 *)(inline_pos + offset);
1611}
1612
1613int empty_inline_dir(struct inode *dir, int *has_inline_data)
1614{
1615 int err, inline_size;
1616 struct ext4_iloc iloc;
1617 void *inline_pos;
1618 unsigned int offset;
1619 struct ext4_dir_entry_2 *de;
1620 int ret = 1;
1621
1622 err = ext4_get_inode_loc(dir, &iloc);
1623 if (err) {
1624 EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
1625 err, dir->i_ino);
1626 return 1;
1627 }
1628
1629 down_read(&EXT4_I(dir)->xattr_sem);
1630 if (!ext4_has_inline_data(dir)) {
1631 *has_inline_data = 0;
1632 goto out;
1633 }
1634
1635 de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1636 if (!le32_to_cpu(de->inode)) {
1637 ext4_warning(dir->i_sb,
1638 "bad inline directory (dir #%lu) - no `..'",
1639 dir->i_ino);
1640 ret = 1;
1641 goto out;
1642 }
1643
1644 offset = EXT4_INLINE_DOTDOT_SIZE;
1645 while (offset < dir->i_size) {
1646 de = ext4_get_inline_entry(dir, &iloc, offset,
1647 &inline_pos, &inline_size);
1648 if (ext4_check_dir_entry(dir, NULL, de,
1649 iloc.bh, inline_pos,
1650 inline_size, offset)) {
1651 ext4_warning(dir->i_sb,
1652 "bad inline directory (dir #%lu) - "
1653				     "inode %u, rec_len %u, name_len %d, "
1654				     "inline size %d\n",
1655 dir->i_ino, le32_to_cpu(de->inode),
1656 le16_to_cpu(de->rec_len), de->name_len,
1657 inline_size);
1658 ret = 1;
1659 goto out;
1660 }
1661 if (le32_to_cpu(de->inode)) {
1662 ret = 0;
1663 goto out;
1664 }
1665 offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
1666 }
1667
1668out:
1669 up_read(&EXT4_I(dir)->xattr_sem);
1670 brelse(iloc.bh);
1671 return ret;
1672}
1673
1674int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
1675{
1676 int ret;
1677
1678 down_write(&EXT4_I(inode)->xattr_sem);
1679 ret = ext4_destroy_inline_data_nolock(handle, inode);
1680 up_write(&EXT4_I(inode)->xattr_sem);
1681
1682 return ret;
1683}
1684
1685int ext4_inline_data_fiemap(struct inode *inode,
1686 struct fiemap_extent_info *fieinfo,
1687 int *has_inline)
1688{
1689 __u64 physical = 0;
1690 __u64 length;
1691 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
1692 int error = 0;
1693 struct ext4_iloc iloc;
1694
1695 down_read(&EXT4_I(inode)->xattr_sem);
1696 if (!ext4_has_inline_data(inode)) {
1697 *has_inline = 0;
1698 goto out;
1699 }
1700
1701 error = ext4_get_inode_loc(inode, &iloc);
1702 if (error)
1703 goto out;
1704
1705 physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1706 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1707 physical += offsetof(struct ext4_inode, i_block);
1708 length = i_size_read(inode);
1709
1710 if (physical)
1711 error = fiemap_fill_next_extent(fieinfo, 0, physical,
1712 length, flags);
1713 brelse(iloc.bh);
1714out:
1715 up_read(&EXT4_I(inode)->xattr_sem);
1716 return (error < 0 ? error : 0);
1717}
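
/*
 * [Editorial sketch -- not part of the kernel source.] The physical
 * offset reported above is plain byte arithmetic: the inode table
 * block's byte address, plus the raw inode's offset within that block,
 * plus offsetof(struct ext4_inode, i_block). A worked example with
 * illustrative numbers (not taken from a real filesystem).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t itable_block = 1234;	/* block holding the inode table */
	unsigned blkbits = 12;		/* 4 KiB block size */
	unsigned raw_off = 512;		/* raw inode offset within the block */
	unsigned iblock_off = 40;	/* offset of i_block in the inode */

	uint64_t physical = (itable_block << blkbits) + raw_off + iblock_off;

	/* prints 5055016: the device byte where the inline data starts */
	printf("inline data at device byte %llu\n",
	       (unsigned long long)physical);
	return 0;
}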
1718
1719/*
1720 * Called during xattr set, and if we can spare space 'needed',
1721 * just create the extent tree and evict the data to an external block.
1722 *
1723 * We use jbd2 instead of page cache to move data to the 1st block
1724 * so that the whole transaction can be committed as a whole and
1725 * the data isn't lost because of the delayed page cache write.
1726 */
1727int ext4_try_to_evict_inline_data(handle_t *handle,
1728 struct inode *inode,
1729 int needed)
1730{
1731 int error;
1732 struct ext4_xattr_entry *entry;
1733 struct ext4_xattr_ibody_header *header;
1734 struct ext4_inode *raw_inode;
1735 struct ext4_iloc iloc;
1736
1737 error = ext4_get_inode_loc(inode, &iloc);
1738 if (error)
1739 return error;
1740
1741 raw_inode = ext4_raw_inode(&iloc);
1742 header = IHDR(inode, raw_inode);
1743 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
1744 EXT4_I(inode)->i_inline_off);
1745 if (EXT4_XATTR_LEN(entry->e_name_len) +
1746 EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
1747 error = -ENOSPC;
1748 goto out;
1749 }
1750
1751 error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
1752out:
1753 brelse(iloc.bh);
1754 return error;
1755}
1756
1757void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
1758{
1759 handle_t *handle;
1760 int inline_size, value_len, needed_blocks;
1761 size_t i_size;
1762 void *value = NULL;
1763 struct ext4_xattr_ibody_find is = {
1764 .s = { .not_found = -ENODATA, },
1765 };
1766 struct ext4_xattr_info i = {
1767 .name_index = EXT4_XATTR_INDEX_SYSTEM,
1768 .name = EXT4_XATTR_SYSTEM_DATA,
1769 };
1770
1771
1772 needed_blocks = ext4_writepage_trans_blocks(inode);
1773 handle = ext4_journal_start(inode, needed_blocks);
1774 if (IS_ERR(handle))
1775 return;
1776
1777 down_write(&EXT4_I(inode)->xattr_sem);
1778 if (!ext4_has_inline_data(inode)) {
1779 *has_inline = 0;
1780 ext4_journal_stop(handle);
1781 return;
1782 }
1783
1784 if (ext4_orphan_add(handle, inode))
1785 goto out;
1786
1787 if (ext4_get_inode_loc(inode, &is.iloc))
1788 goto out;
1789
1790 down_write(&EXT4_I(inode)->i_data_sem);
1791 i_size = inode->i_size;
1792 inline_size = ext4_get_inline_size(inode);
1793 EXT4_I(inode)->i_disksize = i_size;
1794
1795 if (i_size < inline_size) {
1796 /* Clear the content in the xattr space. */
1797 if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
1798 if (ext4_xattr_ibody_find(inode, &i, &is))
1799 goto out_error;
1800
1801 BUG_ON(is.s.not_found);
1802
1803 value_len = le32_to_cpu(is.s.here->e_value_size);
1804 value = kmalloc(value_len, GFP_NOFS);
1805 if (!value)
1806 goto out_error;
1807
1808 if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
1809 value, value_len))
1810 goto out_error;
1811
1812 i.value = value;
1813 i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
1814 i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
1815 if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
1816 goto out_error;
1817 }
1818
1819 /* Clear the content within i_blocks. */
1820 if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
1821 memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
1822 EXT4_MIN_INLINE_DATA_SIZE - i_size);
1823
1824 EXT4_I(inode)->i_inline_size = i_size <
1825 EXT4_MIN_INLINE_DATA_SIZE ?
1826 EXT4_MIN_INLINE_DATA_SIZE : i_size;
1827 }
1828
1829out_error:
1830 up_write(&EXT4_I(inode)->i_data_sem);
1831out:
1832 brelse(is.iloc.bh);
1833 up_write(&EXT4_I(inode)->xattr_sem);
1834 kfree(value);
1835 if (inode->i_nlink)
1836 ext4_orphan_del(handle, inode);
1837
1838 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1839 ext4_mark_inode_dirty(handle, inode);
1840 if (IS_SYNC(inode))
1841 ext4_handle_sync(handle);
1842
1843 ext4_journal_stop(handle);
1844 return;
1845}
1846
1847int ext4_convert_inline_data(struct inode *inode)
1848{
1849 int error, needed_blocks;
1850 handle_t *handle;
1851 struct ext4_iloc iloc;
1852
1853 if (!ext4_has_inline_data(inode)) {
1854 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1855 return 0;
1856 }
1857
1858 needed_blocks = ext4_writepage_trans_blocks(inode);
1859
1860 iloc.bh = NULL;
1861 error = ext4_get_inode_loc(inode, &iloc);
1862 if (error)
1863 return error;
1864
1865 handle = ext4_journal_start(inode, needed_blocks);
1866 if (IS_ERR(handle)) {
1867 error = PTR_ERR(handle);
1868 goto out_free;
1869 }
1870
1871 down_write(&EXT4_I(inode)->xattr_sem);
1872 if (!ext4_has_inline_data(inode)) {
1873 up_write(&EXT4_I(inode)->xattr_sem);
1874 goto out;
1875 }
1876
1877 error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
1878 up_write(&EXT4_I(inode)->xattr_sem);
1879out:
1880 ext4_journal_stop(handle);
1881out_free:
1882 brelse(iloc.bh);
1883 return error;
1884}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3c243b9afa5..cb1c1ab2720b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -484,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
484} 484}
485 485
486/* 486/*
487 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
488 */
489static void set_buffers_da_mapped(struct inode *inode,
490 struct ext4_map_blocks *map)
491{
492 struct address_space *mapping = inode->i_mapping;
493 struct pagevec pvec;
494 int i, nr_pages;
495 pgoff_t index, end;
496
497 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
498 end = (map->m_lblk + map->m_len - 1) >>
499 (PAGE_CACHE_SHIFT - inode->i_blkbits);
500
501 pagevec_init(&pvec, 0);
502 while (index <= end) {
503 nr_pages = pagevec_lookup(&pvec, mapping, index,
504 min(end - index + 1,
505 (pgoff_t)PAGEVEC_SIZE));
506 if (nr_pages == 0)
507 break;
508 for (i = 0; i < nr_pages; i++) {
509 struct page *page = pvec.pages[i];
510 struct buffer_head *bh, *head;
511
512 if (unlikely(page->mapping != mapping) ||
513 !PageDirty(page))
514 break;
515
516 if (page_has_buffers(page)) {
517 bh = head = page_buffers(page);
518 do {
519 set_buffer_da_mapped(bh);
520 bh = bh->b_this_page;
521 } while (bh != head);
522 }
523 index++;
524 }
525 pagevec_release(&pvec);
526 }
527}
528
529/*
530 * The ext4_map_blocks() function tries to look up the requested blocks, 487 * The ext4_map_blocks() function tries to look up the requested blocks,
531 * and returns if the blocks are already mapped. 488 * and returns if the blocks are already mapped.
532 * 489 *
@@ -574,7 +531,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
574 up_read((&EXT4_I(inode)->i_data_sem)); 531 up_read((&EXT4_I(inode)->i_data_sem));
575 532
576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 533 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
577 int ret = check_block_validity(inode, map); 534 int ret;
535 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
536 /* delayed alloc may be allocated by fallocate and
537 * converted to initialized by direct I/O.
538 * We need to handle the delayed extent here.
539 */
540 down_write((&EXT4_I(inode)->i_data_sem));
541 goto delayed_mapped;
542 }
543 ret = check_block_validity(inode, map);
578 if (ret != 0) 544 if (ret != 0)
579 return ret; 545 return ret;
580 } 546 }
@@ -652,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
652 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 618 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 619 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
654 620
655 /* If we have successfully mapped the delayed allocated blocks, 621 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
656 * set the BH_Da_Mapped bit on them. Its important to do this 622 int ret;
657 * under the protection of i_data_sem. 623delayed_mapped:
658 */ 624 /* delayed allocation blocks has been allocated */
659 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 625 ret = ext4_es_remove_extent(inode, map->m_lblk,
660 set_buffers_da_mapped(inode, map); 626 map->m_len);
627 if (ret < 0)
628 retval = ret;
629 }
661 } 630 }
662 631
663 up_write((&EXT4_I(inode)->i_data_sem)); 632 up_write((&EXT4_I(inode)->i_data_sem));
@@ -680,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
680 int ret = 0, started = 0; 649 int ret = 0, started = 0;
681 int dio_credits; 650 int dio_credits;
682 651
652 if (ext4_has_inline_data(inode))
653 return -ERANGE;
654
683 map.m_lblk = iblock; 655 map.m_lblk = iblock;
684 map.m_len = bh->b_size >> inode->i_blkbits; 656 map.m_len = bh->b_size >> inode->i_blkbits;
685 657
686 if (flags && !handle) { 658 if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
687 /* Direct IO write... */ 659 /* Direct IO write... */
688 if (map.m_len > DIO_MAX_BLOCKS) 660 if (map.m_len > DIO_MAX_BLOCKS)
689 map.m_len = DIO_MAX_BLOCKS; 661 map.m_len = DIO_MAX_BLOCKS;
@@ -798,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
798 return NULL; 770 return NULL;
799} 771}
800 772
801static int walk_page_buffers(handle_t *handle, 773int ext4_walk_page_buffers(handle_t *handle,
802 struct buffer_head *head, 774 struct buffer_head *head,
803 unsigned from, 775 unsigned from,
804 unsigned to, 776 unsigned to,
805 int *partial, 777 int *partial,
806 int (*fn)(handle_t *handle, 778 int (*fn)(handle_t *handle,
807 struct buffer_head *bh)) 779 struct buffer_head *bh))
808{ 780{
809 struct buffer_head *bh; 781 struct buffer_head *bh;
810 unsigned block_start, block_end; 782 unsigned block_start, block_end;
@@ -854,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
854 * is elevated. We'll still have enough credits for the tiny quotafile 826 * is elevated. We'll still have enough credits for the tiny quotafile
855 * write. 827 * write.
856 */ 828 */
857static int do_journal_get_write_access(handle_t *handle, 829int do_journal_get_write_access(handle_t *handle,
858 struct buffer_head *bh) 830 struct buffer_head *bh)
859{ 831{
860 int dirty = buffer_dirty(bh); 832 int dirty = buffer_dirty(bh);
861 int ret; 833 int ret;
@@ -878,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
878 return ret; 850 return ret;
879} 851}
880 852
881static int ext4_get_block_write(struct inode *inode, sector_t iblock, 853static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
882 struct buffer_head *bh_result, int create); 854 struct buffer_head *bh_result, int create);
883static int ext4_write_begin(struct file *file, struct address_space *mapping, 855static int ext4_write_begin(struct file *file, struct address_space *mapping,
884 loff_t pos, unsigned len, unsigned flags, 856 loff_t pos, unsigned len, unsigned flags,
@@ -902,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
902 from = pos & (PAGE_CACHE_SIZE - 1); 874 from = pos & (PAGE_CACHE_SIZE - 1);
903 to = from + len; 875 to = from + len;
904 876
877 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
878 ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
879 flags, pagep);
880 if (ret < 0)
881 goto out;
882 if (ret == 1) {
883 ret = 0;
884 goto out;
885 }
886 }
887
905retry: 888retry:
906 handle = ext4_journal_start(inode, needed_blocks); 889 handle = ext4_journal_start(inode, needed_blocks);
907 if (IS_ERR(handle)) { 890 if (IS_ERR(handle)) {
@@ -919,6 +902,7 @@ retry:
919 ret = -ENOMEM; 902 ret = -ENOMEM;
920 goto out; 903 goto out;
921 } 904 }
905
922 *pagep = page; 906 *pagep = page;
923 907
924 if (ext4_should_dioread_nolock(inode)) 908 if (ext4_should_dioread_nolock(inode))
@@ -927,8 +911,9 @@ retry:
927 ret = __block_write_begin(page, pos, len, ext4_get_block); 911 ret = __block_write_begin(page, pos, len, ext4_get_block);
928 912
929 if (!ret && ext4_should_journal_data(inode)) { 913 if (!ret && ext4_should_journal_data(inode)) {
930 ret = walk_page_buffers(handle, page_buffers(page), 914 ret = ext4_walk_page_buffers(handle, page_buffers(page),
931 from, to, NULL, do_journal_get_write_access); 915 from, to, NULL,
916 do_journal_get_write_access);
932 } 917 }
933 918
934 if (ret) { 919 if (ret) {
@@ -983,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
983 struct inode *inode = mapping->host; 968 struct inode *inode = mapping->host;
984 handle_t *handle = ext4_journal_current_handle(); 969 handle_t *handle = ext4_journal_current_handle();
985 970
986 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 971 if (ext4_has_inline_data(inode))
972 copied = ext4_write_inline_data_end(inode, pos, len,
973 copied, page);
974 else
975 copied = block_write_end(file, mapping, pos,
976 len, copied, page, fsdata);
987 977
988 /* 978 /*
989 * No need to use i_size_read() here, the i_size 979 * No need to use i_size_read() here, the i_size
@@ -1134,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
1134 1124
1135 BUG_ON(!ext4_handle_valid(handle)); 1125 BUG_ON(!ext4_handle_valid(handle));
1136 1126
1137 if (copied < len) { 1127 if (ext4_has_inline_data(inode))
1138 if (!PageUptodate(page)) 1128 copied = ext4_write_inline_data_end(inode, pos, len,
1139 copied = 0; 1129 copied, page);
1140 page_zero_new_buffers(page, from+copied, to); 1130 else {
1141 } 1131 if (copied < len) {
1132 if (!PageUptodate(page))
1133 copied = 0;
1134 page_zero_new_buffers(page, from+copied, to);
1135 }
1142 1136
1143 ret = walk_page_buffers(handle, page_buffers(page), from, 1137 ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
1144 to, &partial, write_end_fn); 1138 to, &partial, write_end_fn);
1145 if (!partial) 1139 if (!partial)
1146 SetPageUptodate(page); 1140 SetPageUptodate(page);
1141 }
1147 new_i_size = pos + copied; 1142 new_i_size = pos + copied;
1148 if (new_i_size > inode->i_size) 1143 if (new_i_size > inode->i_size)
1149 i_size_write(inode, pos+copied); 1144 i_size_write(inode, pos+copied);
@@ -1301,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
1301 struct inode *inode = page->mapping->host; 1296 struct inode *inode = page->mapping->host;
1302 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1297 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1303 int num_clusters; 1298 int num_clusters;
1299 ext4_fsblk_t lblk;
1304 1300
1305 head = page_buffers(page); 1301 head = page_buffers(page);
1306 bh = head; 1302 bh = head;
@@ -1310,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1310 if ((offset <= curr_off) && (buffer_delay(bh))) { 1306 if ((offset <= curr_off) && (buffer_delay(bh))) {
1311 to_release++; 1307 to_release++;
1312 clear_buffer_delay(bh); 1308 clear_buffer_delay(bh);
1313 clear_buffer_da_mapped(bh);
1314 } 1309 }
1315 curr_off = next_off; 1310 curr_off = next_off;
1316 } while ((bh = bh->b_this_page) != head); 1311 } while ((bh = bh->b_this_page) != head);
1317 1312
1313 if (to_release) {
1314 lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1315 ext4_es_remove_extent(inode, lblk, to_release);
1316 }
1317
1318 /* If we have released all the blocks belonging to a cluster, then we 1318 /* If we have released all the blocks belonging to a cluster, then we
1319 * need to release the reserved space for that cluster. */ 1319 * need to release the reserved space for that cluster. */
1320 num_clusters = EXT4_NUM_B2C(sbi, to_release); 1320 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1321 while (num_clusters > 0) { 1321 while (num_clusters > 0) {
1322 ext4_fsblk_t lblk;
1323 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + 1322 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1324 ((num_clusters - 1) << sbi->s_cluster_bits); 1323 ((num_clusters - 1) << sbi->s_cluster_bits);
1325 if (sbi->s_cluster_ratio == 1 || 1324 if (sbi->s_cluster_ratio == 1 ||
1326 !ext4_find_delalloc_cluster(inode, lblk, 1)) 1325 !ext4_find_delalloc_cluster(inode, lblk))
1327 ext4_da_release_space(inode, 1); 1326 ext4_da_release_space(inode, 1);
1328 1327
1329 num_clusters--; 1328 num_clusters--;
@@ -1429,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1429 clear_buffer_delay(bh); 1428 clear_buffer_delay(bh);
1430 bh->b_blocknr = pblock; 1429 bh->b_blocknr = pblock;
1431 } 1430 }
1432 if (buffer_da_mapped(bh))
1433 clear_buffer_da_mapped(bh);
1434 if (buffer_unwritten(bh) || 1431 if (buffer_unwritten(bh) ||
1435 buffer_mapped(bh)) 1432 buffer_mapped(bh))
1436 BUG_ON(bh->b_blocknr != pblock); 1433 BUG_ON(bh->b_blocknr != pblock);
@@ -1500,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1500 struct pagevec pvec; 1497 struct pagevec pvec;
1501 struct inode *inode = mpd->inode; 1498 struct inode *inode = mpd->inode;
1502 struct address_space *mapping = inode->i_mapping; 1499 struct address_space *mapping = inode->i_mapping;
1500 ext4_lblk_t start, last;
1503 1501
1504 index = mpd->first_page; 1502 index = mpd->first_page;
1505 end = mpd->next_page - 1; 1503 end = mpd->next_page - 1;
1504
1505 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1506 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1507 ext4_es_remove_extent(inode, start, last - start + 1);
1508
1509 pagevec_init(&pvec, 0);
1506 while (index <= end) { 1510 while (index <= end) {
1507 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1511 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1508 if (nr_pages == 0) 1512 if (nr_pages == 0)
@@ -1656,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1656 1660
1657 for (i = 0; i < map.m_len; i++) 1661 for (i = 0; i < map.m_len; i++)
1658 unmap_underlying_metadata(bdev, map.m_pblk + i); 1662 unmap_underlying_metadata(bdev, map.m_pblk + i);
1659
1660 if (ext4_should_order_data(mpd->inode)) {
1661 err = ext4_jbd2_file_inode(handle, mpd->inode);
1662 if (err) {
1663 /* Only if the journal is aborted */
1664 mpd->retval = err;
1665 goto submit_io;
1666 }
1667 }
1668 } 1663 }
1669 1664
1670 /* 1665 /*
@@ -1795,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1795 * file system block. 1790 * file system block.
1796 */ 1791 */
1797 down_read((&EXT4_I(inode)->i_data_sem)); 1792 down_read((&EXT4_I(inode)->i_data_sem));
1798 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1793 if (ext4_has_inline_data(inode)) {
1794 /*
1795 * We will soon create blocks for this page, and let
1796 * us pretend as if the blocks aren't allocated yet.
1797 * In case of clusters, we have to handle the work
1798 * of mapping from cluster so that the reserved space
1799 * is calculated properly.
1800 */
1801 if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
1802 ext4_find_delalloc_cluster(inode, map->m_lblk))
1803 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
1804 retval = 0;
1805 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1799 retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1806 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1800 else 1807 else
1801 retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1808 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1814,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1814 goto out_unlock; 1821 goto out_unlock;
1815 } 1822 }
1816 1823
1824 retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
1825 if (retval)
1826 goto out_unlock;
1827
1817 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served 1828 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1818 * and it should not appear on the bh->b_state. 1829 * and it should not appear on the bh->b_state.
1819 */ 1830 */
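ext4_da_map_blocks() now records every delayed mapping with ext4_es_insert_extent() before dropping i_data_sem, so later page invalidation and writeback can find the reservation again; on failure it unwinds via out_unlock. A toy userspace model of that bookkeeping, with a fixed-size table standing in for the real extents-status tree (all names here are illustrative):

#include <stdio.h>
#include <errno.h>

struct es_range { unsigned int lblk, len; };

static struct es_range es_tree[16];
static int es_count;

static int es_insert(unsigned int lblk, unsigned int len)
{
	if (es_count == 16)
		return -ENOMEM;          /* caller unwinds, as ext4 does */
	es_tree[es_count].lblk = lblk;
	es_tree[es_count].len = len;
	es_count++;
	return 0;
}

int main(void)
{
	int ret = es_insert(8, 4);       /* mirrors map->m_lblk, map->m_len */

	if (ret)                         /* ext4 would goto out_unlock here */
		fprintf(stderr, "insert failed: %d\n", ret);
	printf("tracked delayed ranges: %d\n", es_count);
	return ret;
}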
@@ -1842,8 +1853,8 @@ out_unlock:
1842 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 1853 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1843 * initialized properly. 1854 * initialized properly.
1844 */ 1855 */
1845static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1856int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1846 struct buffer_head *bh, int create) 1857 struct buffer_head *bh, int create)
1847{ 1858{
1848 struct ext4_map_blocks map; 1859 struct ext4_map_blocks map;
1849 int ret = 0; 1860 int ret = 0;
@@ -1917,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
1917{ 1928{
1918 struct address_space *mapping = page->mapping; 1929 struct address_space *mapping = page->mapping;
1919 struct inode *inode = mapping->host; 1930 struct inode *inode = mapping->host;
1920 struct buffer_head *page_bufs; 1931 struct buffer_head *page_bufs = NULL;
1921 handle_t *handle = NULL; 1932 handle_t *handle = NULL;
1922 int ret = 0; 1933 int ret = 0, err = 0;
1923 int err; 1934 int inline_data = ext4_has_inline_data(inode);
1935 struct buffer_head *inode_bh = NULL;
1924 1936
1925 ClearPageChecked(page); 1937 ClearPageChecked(page);
1926 page_bufs = page_buffers(page); 1938
1927 BUG_ON(!page_bufs); 1939 if (inline_data) {
1928 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 1940 BUG_ON(page->index != 0);
1941 BUG_ON(len > ext4_get_max_inline_size(inode));
1942 inode_bh = ext4_journalled_write_inline_data(inode, len, page);
1943 if (inode_bh == NULL)
1944 goto out;
1945 } else {
1946 page_bufs = page_buffers(page);
1947 if (!page_bufs) {
1948 BUG();
1949 goto out;
1950 }
1951 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1952 NULL, bget_one);
1953 }
1929 /* As soon as we unlock the page, it can go away, but we have 1954 /* As soon as we unlock the page, it can go away, but we have
1930 * references to buffers so we are safe */ 1955 * references to buffers so we are safe */
1931 unlock_page(page); 1956 unlock_page(page);
@@ -1938,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
1938 1963
1939 BUG_ON(!ext4_handle_valid(handle)); 1964 BUG_ON(!ext4_handle_valid(handle));
1940 1965
1941 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1966 if (inline_data) {
1942 do_journal_get_write_access); 1967 ret = ext4_journal_get_write_access(handle, inode_bh);
1968
1969 err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
1943 1970
1944 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1971 } else {
1945 write_end_fn); 1972 ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1973 do_journal_get_write_access);
1974
1975 err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1976 write_end_fn);
1977 }
1946 if (ret == 0) 1978 if (ret == 0)
1947 ret = err; 1979 ret = err;
1948 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1980 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1950,9 +1982,12 @@ static int __ext4_journalled_writepage(struct page *page,
1950 if (!ret) 1982 if (!ret)
1951 ret = err; 1983 ret = err;
1952 1984
1953 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 1985 if (!ext4_has_inline_data(inode))
1986 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1987 NULL, bput_one);
1954 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1988 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1955out: 1989out:
1990 brelse(inode_bh);
1956 return ret; 1991 return ret;
1957} 1992}
1958 1993
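__ext4_journalled_writepage() now forks on ext4_has_inline_data(): inline files journal the single inode buffer directly, while regular pages still walk their buffer_heads, and in both paths the first error wins through the ret/err merge. A compilable sketch of that two-path shape, with stand-in functions rather than the real JBD2 calls:

#include <stdio.h>

static int journal_inode_buffer(void)
{
	puts("get_write_access + dirty the inode buffer");
	return 0;
}

static int journal_page_buffers(int len)
{
	printf("walk and journal %d bytes of page buffers\n", len);
	return 0;
}

int main(void)
{
	int inline_data = 1;   /* pretend ext4_has_inline_data() said yes */
	int len = 60;

	return inline_data ? journal_inode_buffer()
			   : journal_page_buffers(len);
}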
@@ -2029,8 +2064,8 @@ static int ext4_writepage(struct page *page,
2029 commit_write = 1; 2064 commit_write = 1;
2030 } 2065 }
2031 page_bufs = page_buffers(page); 2066 page_bufs = page_buffers(page);
2032 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2067 if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2033 ext4_bh_delay_or_unwritten)) { 2068 ext4_bh_delay_or_unwritten)) {
2034 /* 2069 /*
2035 * We don't want to do block allocation, so redirty 2070 * We don't want to do block allocation, so redirty
2036 * the page and return. We may reach here when we do 2071 * the page and return. We may reach here when we do
@@ -2096,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2096 * mpage_da_map_and_submit to map a single contiguous memory region 2131 * mpage_da_map_and_submit to map a single contiguous memory region
2097 * and then write them. 2132 * and then write them.
2098 */ 2133 */
2099static int write_cache_pages_da(struct address_space *mapping, 2134static int write_cache_pages_da(handle_t *handle,
2135 struct address_space *mapping,
2100 struct writeback_control *wbc, 2136 struct writeback_control *wbc,
2101 struct mpage_da_data *mpd, 2137 struct mpage_da_data *mpd,
2102 pgoff_t *done_index) 2138 pgoff_t *done_index)
@@ -2175,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
2175 wait_on_page_writeback(page); 2211 wait_on_page_writeback(page);
2176 BUG_ON(PageWriteback(page)); 2212 BUG_ON(PageWriteback(page));
2177 2213
2214 /*
2215 * If we have inline data and arrive here, it means that
2216 * we will soon create the block for the 1st page, so
2217 * we'd better clear the inline data here.
2218 */
2219 if (ext4_has_inline_data(inode)) {
2220 BUG_ON(ext4_test_inode_state(inode,
2221 EXT4_STATE_MAY_INLINE_DATA));
2222 ext4_destroy_inline_data(handle, inode);
2223 }
2224
2178 if (mpd->next_page != page->index) 2225 if (mpd->next_page != page->index)
2179 mpd->first_page = page->index; 2226 mpd->first_page = page->index;
2180 mpd->next_page = page->index + 1; 2227 mpd->next_page = page->index + 1;
@@ -2381,7 +2428,8 @@ retry:
2381 * contiguous region of logical blocks that need 2428 * contiguous region of logical blocks that need
2382 * blocks to be allocated by ext4 and submit them. 2429 * blocks to be allocated by ext4 and submit them.
2383 */ 2430 */
2384 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 2431 ret = write_cache_pages_da(handle, mapping,
2432 wbc, &mpd, &done_index);
2385 /* 2433 /*
2386 * If we have a contiguous extent of pages and we 2434 * If we have a contiguous extent of pages and we
2387 * haven't done the I/O yet, map the blocks and submit 2435 * haven't done the I/O yet, map the blocks and submit
@@ -2445,7 +2493,6 @@ out_writepages:
2445 return ret; 2493 return ret;
2446} 2494}
2447 2495
2448#define FALL_BACK_TO_NONDELALLOC 1
2449static int ext4_nonda_switch(struct super_block *sb) 2496static int ext4_nonda_switch(struct super_block *sb)
2450{ 2497{
2451 s64 free_blocks, dirty_blocks; 2498 s64 free_blocks, dirty_blocks;
@@ -2502,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2502 } 2549 }
2503 *fsdata = (void *)0; 2550 *fsdata = (void *)0;
2504 trace_ext4_da_write_begin(inode, pos, len, flags); 2551 trace_ext4_da_write_begin(inode, pos, len, flags);
2552
2553 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2554 ret = ext4_da_write_inline_data_begin(mapping, inode,
2555 pos, len, flags,
2556 pagep, fsdata);
2557 if (ret < 0)
2558 goto out;
2559 if (ret == 1) {
2560 ret = 0;
2561 goto out;
2562 }
2563 }
2564
2505retry: 2565retry:
2506 /* 2566 /*
2507 * With delayed allocation, we don't log the i_disksize update 2567 * With delayed allocation, we don't log the i_disksize update
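The inline hook in ext4_da_write_begin() uses a three-way convention: a negative return is an error, 1 means the write was fully served from inline data, and 0 falls through to the normal delayed-allocation path at retry:. A small sketch of that dispatch, using a hypothetical try_inline_write_begin() in place of ext4_da_write_inline_data_begin():

#include <stdio.h>

static int try_inline_write_begin(int fits_inline)
{
	if (fits_inline < 0)
		return -1;              /* propagate errors */
	return fits_inline ? 1 : 0;     /* 1: handled, 0: use block path */
}

int main(void)
{
	int ret = try_inline_write_begin(1);

	if (ret < 0)
		return 1;               /* ext4 would goto out */
	if (ret == 1)
		puts("write satisfied from inline data");
	else
		puts("fall through to the delayed-allocation path");
	return 0;
}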
@@ -2603,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
2603 * changes. So let's piggyback the i_disksize mark_inode_dirty 2663 * changes. So let's piggyback the i_disksize mark_inode_dirty
2604 * into that. 2664 * into that.
2605 */ 2665 */
2606
2607 new_i_size = pos + copied; 2666 new_i_size = pos + copied;
2608 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2667 if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
2609 if (ext4_da_should_update_i_disksize(page, end)) { 2668 if (ext4_has_inline_data(inode) ||
2669 ext4_da_should_update_i_disksize(page, end)) {
2610 down_write(&EXT4_I(inode)->i_data_sem); 2670 down_write(&EXT4_I(inode)->i_data_sem);
2611 if (new_i_size > EXT4_I(inode)->i_disksize) { 2671 if (new_i_size > EXT4_I(inode)->i_disksize)
2612 /*
2613 * Updating i_disksize when extending file
2614 * without needing block allocation
2615 */
2616 if (ext4_should_order_data(inode))
2617 ret = ext4_jbd2_file_inode(handle,
2618 inode);
2619
2620 EXT4_I(inode)->i_disksize = new_i_size; 2672 EXT4_I(inode)->i_disksize = new_i_size;
2621 }
2622 up_write(&EXT4_I(inode)->i_data_sem); 2673 up_write(&EXT4_I(inode)->i_data_sem);
2623 /* We need to mark inode dirty even if 2674 /* We need to mark inode dirty even if
2624 * new_i_size is less than inode->i_size 2675 * new_i_size is less than inode->i_size
@@ -2627,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
2627 ext4_mark_inode_dirty(handle, inode); 2678 ext4_mark_inode_dirty(handle, inode);
2628 } 2679 }
2629 } 2680 }
2630 ret2 = generic_write_end(file, mapping, pos, len, copied, 2681
2682 if (write_mode != CONVERT_INLINE_DATA &&
2683 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
2684 ext4_has_inline_data(inode))
2685 ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
2686 page);
2687 else
2688 ret2 = generic_write_end(file, mapping, pos, len, copied,
2631 page, fsdata); 2689 page, fsdata);
2690
2632 copied = ret2; 2691 copied = ret2;
2633 if (ret2 < 0) 2692 if (ret2 < 0)
2634 ret = ret2; 2693 ret = ret2;
@@ -2721,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2721 journal_t *journal; 2780 journal_t *journal;
2722 int err; 2781 int err;
2723 2782
2783 /*
2784 * We can get here for an inline file via the FIBMAP ioctl
2785 */
2786 if (ext4_has_inline_data(inode))
2787 return 0;
2788
2724 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2789 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2725 test_opt(inode->i_sb, DELALLOC)) { 2790 test_opt(inode->i_sb, DELALLOC)) {
2726 /* 2791 /*
@@ -2766,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2766 2831
2767static int ext4_readpage(struct file *file, struct page *page) 2832static int ext4_readpage(struct file *file, struct page *page)
2768{ 2833{
2834 int ret = -EAGAIN;
2835 struct inode *inode = page->mapping->host;
2836
2769 trace_ext4_readpage(page); 2837 trace_ext4_readpage(page);
2770 return mpage_readpage(page, ext4_get_block); 2838
2839 if (ext4_has_inline_data(inode))
2840 ret = ext4_readpage_inline(inode, page);
2841
2842 if (ret == -EAGAIN)
2843 return mpage_readpage(page, ext4_get_block);
2844
2845 return ret;
2771} 2846}
2772 2847
2773static int 2848static int
2774ext4_readpages(struct file *file, struct address_space *mapping, 2849ext4_readpages(struct file *file, struct address_space *mapping,
2775 struct list_head *pages, unsigned nr_pages) 2850 struct list_head *pages, unsigned nr_pages)
2776{ 2851{
2852 struct inode *inode = mapping->host;
2853
2854 /* If the file has inline data, no need to do readpages. */
2855 if (ext4_has_inline_data(inode))
2856 return 0;
2857
2777 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2858 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2778} 2859}
2779 2860
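ext4_readpage() seeds ret with -EAGAIN so the inline reader can either satisfy the page or decline, in which case mpage_readpage() runs unchanged; ext4_readpages() simply opts out for inline files. The same hand-off in a runnable userspace sketch (function names are stand-ins):

#include <stdio.h>
#include <errno.h>

static int readpage_inline_demo(int has_inline)
{
	return has_inline ? 0 : -EAGAIN;
}

static int mpage_readpage_demo(void)
{
	puts("read the page via block mapping");
	return 0;
}

int main(void)
{
	int ret = -EAGAIN;
	int has_inline = 0;

	if (has_inline)
		ret = readpage_inline_demo(has_inline);
	if (ret == -EAGAIN)
		ret = mpage_readpage_demo();
	return ret;
}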
@@ -2840,7 +2921,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2840 * We allocate an uninitialized extent if blocks haven't been allocated. 2921 * We allocate an uninitialized extent if blocks haven't been allocated.
2841 * The extent will be converted to initialized after the IO is complete. 2922 * The extent will be converted to initialized after the IO is complete.
2842 */ 2923 */
2843static int ext4_get_block_write(struct inode *inode, sector_t iblock, 2924int ext4_get_block_write(struct inode *inode, sector_t iblock,
2844 struct buffer_head *bh_result, int create) 2925 struct buffer_head *bh_result, int create)
2845{ 2926{
2846 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 2927 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2850,29 +2931,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
2850} 2931}
2851 2932
2852static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 2933static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
2853 struct buffer_head *bh_result, int flags) 2934 struct buffer_head *bh_result, int create)
2854{ 2935{
2855 handle_t *handle = ext4_journal_current_handle(); 2936 ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
2856 struct ext4_map_blocks map; 2937 inode->i_ino, create);
2857 int ret = 0; 2938 return _ext4_get_block(inode, iblock, bh_result,
2858 2939 EXT4_GET_BLOCKS_NO_LOCK);
2859 ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n",
2860 inode->i_ino, flags);
2861
2862 flags = EXT4_GET_BLOCKS_NO_LOCK;
2863
2864 map.m_lblk = iblock;
2865 map.m_len = bh_result->b_size >> inode->i_blkbits;
2866
2867 ret = ext4_map_blocks(handle, inode, &map, flags);
2868 if (ret > 0) {
2869 map_bh(bh_result, inode->i_sb, map.m_pblk);
2870 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
2871 map.m_flags;
2872 bh_result->b_size = inode->i_sb->s_blocksize * map.m_len;
2873 ret = 0;
2874 }
2875 return ret;
2876} 2940}
2877 2941
2878static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 2942static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -2978,10 +3042,10 @@ retry:
2978 * fall back to buffered IO. 3042 * fall back to buffered IO.
2979 * 3043 *
2980 * For holes, we fallocate those blocks, mark them as uninitialized 3044 * For holes, we fallocate those blocks, mark them as uninitialized
2981 * If those blocks were preallocated, we make sure they are splited, but 3045 * If those blocks were preallocated, we make sure they are split, but
2982 * still keep the range to write as uninitialized. 3046 * still keep the range to write as uninitialized.
2983 * 3047 *
2984 * The unwrritten extents will be converted to written when DIO is completed. 3048 * The unwritten extents will be converted to written when DIO is completed.
2985 * For async direct IO, since the IO may still be pending when we return, we 3049 * For async direct IO, since the IO may still be pending when we return, we
2986 * set up an end_io call back function, which will do the conversion 3050 * set up an end_io call back function, which will do the conversion
2987 * when async direct IO completed. 3051 * when async direct IO completed.
@@ -2999,125 +3063,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2999 struct inode *inode = file->f_mapping->host; 3063 struct inode *inode = file->f_mapping->host;
3000 ssize_t ret; 3064 ssize_t ret;
3001 size_t count = iov_length(iov, nr_segs); 3065 size_t count = iov_length(iov, nr_segs);
3002 3066 int overwrite = 0;
3067 get_block_t *get_block_func = NULL;
3068 int dio_flags = 0;
3003 loff_t final_size = offset + count; 3069 loff_t final_size = offset + count;
3004 if (rw == WRITE && final_size <= inode->i_size) {
3005 int overwrite = 0;
3006 3070
3007 BUG_ON(iocb->private == NULL); 3071 /* Use the old path for reads and writes beyond i_size. */
3072 if (rw != WRITE || final_size > inode->i_size)
3073 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3008 3074
3009 /* If we do a overwrite dio, i_mutex locking can be released */ 3075 BUG_ON(iocb->private == NULL);
3010 overwrite = *((int *)iocb->private);
3011 3076
3012 if (overwrite) { 3077 /* If we do a overwrite dio, i_mutex locking can be released */
3013 atomic_inc(&inode->i_dio_count); 3078 overwrite = *((int *)iocb->private);
3014 down_read(&EXT4_I(inode)->i_data_sem);
3015 mutex_unlock(&inode->i_mutex);
3016 }
3017 3079
3018 /* 3080 if (overwrite) {
3019 * We could direct write to holes and fallocate. 3081 atomic_inc(&inode->i_dio_count);
3020 * 3082 down_read(&EXT4_I(inode)->i_data_sem);
3021 * Allocated blocks to fill the hole are marked as uninitialized 3083 mutex_unlock(&inode->i_mutex);
3022 * to prevent parallel buffered read to expose the stale data 3084 }
3023 * before DIO complete the data IO.
3024 *
3025 * As to previously fallocated extents, ext4 get_block
3026 * will just simply mark the buffer mapped but still
3027 * keep the extents uninitialized.
3028 *
3029 * for non AIO case, we will convert those unwritten extents
3030 * to written after return back from blockdev_direct_IO.
3031 *
3032 * for async DIO, the conversion needs to be defered when
3033 * the IO is completed. The ext4 end_io callback function
3034 * will be called to take care of the conversion work.
3035 * Here for async case, we allocate an io_end structure to
3036 * hook to the iocb.
3037 */
3038 iocb->private = NULL;
3039 ext4_inode_aio_set(inode, NULL);
3040 if (!is_sync_kiocb(iocb)) {
3041 ext4_io_end_t *io_end =
3042 ext4_init_io_end(inode, GFP_NOFS);
3043 if (!io_end) {
3044 ret = -ENOMEM;
3045 goto retake_lock;
3046 }
3047 io_end->flag |= EXT4_IO_END_DIRECT;
3048 iocb->private = io_end;
3049 /*
3050 * we save the io structure for current async
3051 * direct IO, so that later ext4_map_blocks()
3052 * could flag the io structure whether there
3053 * is a unwritten extents needs to be converted
3054 * when IO is completed.
3055 */
3056 ext4_inode_aio_set(inode, io_end);
3057 }
3058 3085
3059 if (overwrite) 3086 /*
3060 ret = __blockdev_direct_IO(rw, iocb, inode, 3087 * We could direct write to holes and fallocate.
3061 inode->i_sb->s_bdev, iov, 3088 *
3062 offset, nr_segs, 3089 * Allocated blocks to fill the hole are marked as
3063 ext4_get_block_write_nolock, 3090 * uninitialized to prevent parallel buffered read to expose
3064 ext4_end_io_dio, 3091 * the stale data before DIO complete the data IO.
3065 NULL, 3092 *
3066 0); 3093 * As to previously fallocated extents, ext4 get_block will
3067 else 3094 * just simply mark the buffer mapped but still keep the
3068 ret = __blockdev_direct_IO(rw, iocb, inode, 3095 * extents uninitialized.
3069 inode->i_sb->s_bdev, iov, 3096 *
3070 offset, nr_segs, 3097 * For non AIO case, we will convert those unwritten extents
3071 ext4_get_block_write, 3098 * to written after return back from blockdev_direct_IO.
3072 ext4_end_io_dio, 3099 *
3073 NULL, 3100 * For async DIO, the conversion needs to be deferred when the
3074 DIO_LOCKING); 3101 * IO is completed. The ext4 end_io callback function will be
3075 if (iocb->private) 3102 * called to take care of the conversion work. Here for async
3076 ext4_inode_aio_set(inode, NULL); 3103 * case, we allocate an io_end structure to hook to the iocb.
3104 */
3105 iocb->private = NULL;
3106 ext4_inode_aio_set(inode, NULL);
3107 if (!is_sync_kiocb(iocb)) {
3108 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
3109 if (!io_end) {
3110 ret = -ENOMEM;
3111 goto retake_lock;
3112 }
3113 io_end->flag |= EXT4_IO_END_DIRECT;
3114 iocb->private = io_end;
3077 /* 3115 /*
3078 * The io_end structure takes a reference to the inode, 3116 * we save the io structure for current async direct
3079 * that structure needs to be destroyed and the 3117 * IO, so that later ext4_map_blocks() could flag the
3080 * reference to the inode need to be dropped, when IO is 3118 * io structure whether there is a unwritten extents
3081 * complete, even with 0 byte write, or failed. 3119 * needs to be converted when IO is completed.
3082 *
3083 * In the successful AIO DIO case, the io_end structure will be
3084 * destroyed and the reference to the inode will be dropped
3085 * after the end_io call back function is called.
3086 *
3087 * In the case there is 0 byte write, or error case, since
3088 * VFS direct IO won't invoke the end_io call back function,
3089 * we need to free the end_io structure here.
3090 */ 3120 */
3091 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3121 ext4_inode_aio_set(inode, io_end);
3092 ext4_free_io_end(iocb->private); 3122 }
3093 iocb->private = NULL;
3094 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3095 EXT4_STATE_DIO_UNWRITTEN)) {
3096 int err;
3097 /*
3098 * for non AIO case, since the IO is already
3099 * completed, we could do the conversion right here
3100 */
3101 err = ext4_convert_unwritten_extents(inode,
3102 offset, ret);
3103 if (err < 0)
3104 ret = err;
3105 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3106 }
3107 3123
3108 retake_lock: 3124 if (overwrite) {
3109 /* take i_mutex locking again if we do an overwrite dio */ 3125 get_block_func = ext4_get_block_write_nolock;
3110 if (overwrite) { 3126 } else {
3111 inode_dio_done(inode); 3127 get_block_func = ext4_get_block_write;
3112 up_read(&EXT4_I(inode)->i_data_sem); 3128 dio_flags = DIO_LOCKING;
3113 mutex_lock(&inode->i_mutex); 3129 }
3114 } 3130 ret = __blockdev_direct_IO(rw, iocb, inode,
3131 inode->i_sb->s_bdev, iov,
3132 offset, nr_segs,
3133 get_block_func,
3134 ext4_end_io_dio,
3135 NULL,
3136 dio_flags);
3137
3138 if (iocb->private)
3139 ext4_inode_aio_set(inode, NULL);
3140 /*
3141 * The io_end structure takes a reference to the inode, that
3142 * structure needs to be destroyed and the reference to the
3143 * inode needs to be dropped, when IO is complete, even with 0
3144 * byte write, or failed.
3145 *
3146 * In the successful AIO DIO case, the io_end structure will
3147 * be destroyed and the reference to the inode will be dropped
3148 * after the end_io call back function is called.
3149 *
3150 * In the case there is 0 byte write, or error case, since VFS
3151 * direct IO won't invoke the end_io call back function, we
3152 * need to free the end_io structure here.
3153 */
3154 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3155 ext4_free_io_end(iocb->private);
3156 iocb->private = NULL;
3157 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3158 EXT4_STATE_DIO_UNWRITTEN)) {
3159 int err;
3160 /*
3161 * for non AIO case, since the IO is already
3162 * completed, we could do the conversion right here
3163 */
3164 err = ext4_convert_unwritten_extents(inode,
3165 offset, ret);
3166 if (err < 0)
3167 ret = err;
3168 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3169 }
3115 3170
3116 return ret; 3171retake_lock:
3172 /* take i_mutex locking again if we do an overwrite dio */
3173 if (overwrite) {
3174 inode_dio_done(inode);
3175 up_read(&EXT4_I(inode)->i_data_sem);
3176 mutex_lock(&inode->i_mutex);
3117 } 3177 }
3118 3178
3119 /* for write to the end of file case, we fall back to old way */ 3179 return ret;
3120 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3121} 3180}
3122 3181
3123static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3182static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
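The rewrite of ext4_ext_direct_IO() above replaces two near-duplicate __blockdev_direct_IO() invocations with a single call site: the overwrite case picks the no-lock get_block callback, every other write keeps DIO_LOCKING. A compilable sketch of that selection, with a stand-in function-pointer type instead of the kernel's get_block_t:

#include <stdio.h>

typedef int (*get_block_fn)(void);

static int get_block_nolock(void) { return puts("map without i_data_sem"); }
static int get_block_locked(void) { return puts("map with locking"); }

#define DIO_LOCKING 0x1

int main(void)
{
	int overwrite = 0;
	get_block_fn get_block_func;
	int dio_flags = 0;

	if (overwrite) {
		get_block_func = get_block_nolock;
	} else {
		get_block_func = get_block_locked;
		dio_flags = DIO_LOCKING;
	}
	get_block_func();                /* single call site, as in the hunk */
	printf("dio_flags=%#x\n", dio_flags);
	return 0;
}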
@@ -3134,6 +3193,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3134 if (ext4_should_journal_data(inode)) 3193 if (ext4_should_journal_data(inode))
3135 return 0; 3194 return 0;
3136 3195
3196 /* Let buffer I/O handle the inline data case. */
3197 if (ext4_has_inline_data(inode))
3198 return 0;
3199
3137 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3200 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3138 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3201 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3139 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3202 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3531,6 +3594,14 @@ void ext4_truncate(struct inode *inode)
3531 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3594 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3532 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3595 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
3533 3596
3597 if (ext4_has_inline_data(inode)) {
3598 int has_inline = 1;
3599
3600 ext4_inline_data_truncate(inode, &has_inline);
3601 if (has_inline)
3602 return;
3603 }
3604
3534 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3605 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3535 ext4_ext_truncate(inode); 3606 ext4_ext_truncate(inode);
3536 else 3607 else
@@ -3756,6 +3827,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3756 } 3827 }
3757} 3828}
3758 3829
3830static inline void ext4_iget_extra_inode(struct inode *inode,
3831 struct ext4_inode *raw_inode,
3832 struct ext4_inode_info *ei)
3833{
3834 __le32 *magic = (void *)raw_inode +
3835 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
3836 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
3837 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3838 ext4_find_inline_data_nolock(inode);
3839 } else
3840 EXT4_I(inode)->i_inline_off = 0;
3841}
3842
3759struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 3843struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3760{ 3844{
3761 struct ext4_iloc iloc; 3845 struct ext4_iloc iloc;
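ext4_iget_extra_inode() probes just past EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize for the xattr magic; only when the magic is present is it worth calling ext4_find_inline_data_nolock(), otherwise i_inline_off stays 0. An illustrative userspace version of the same probe (the buffer layout is assumed, though 128 and 0xEA020000 do match the on-disk constants):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define GOOD_OLD_INODE_SIZE 128
#define XATTR_MAGIC 0xEA020000u

int main(void)
{
	unsigned char raw_inode[256] = {0};
	uint16_t i_extra_isize = 32;
	uint32_t magic = XATTR_MAGIC;
	uint32_t found;

	/* pretend mkfs wrote the magic at the expected offset */
	memcpy(raw_inode + GOOD_OLD_INODE_SIZE + i_extra_isize,
	       &magic, sizeof(magic));

	memcpy(&found, raw_inode + GOOD_OLD_INODE_SIZE + i_extra_isize,
	       sizeof(found));
	if (found == XATTR_MAGIC)
		puts("in-inode xattr space found: scan it for inline data");
	else
		puts("no xattr area: i_inline_off = 0");
	return 0;
}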
@@ -3826,6 +3910,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3826 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 3910 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
3827 3911
3828 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 3912 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
3913 ei->i_inline_off = 0;
3829 ei->i_dir_start_lookup = 0; 3914 ei->i_dir_start_lookup = 0;
3830 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 3915 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3831 /* We now have enough fields to check if the inode was active or not. 3916 /* We now have enough fields to check if the inode was active or not.
@@ -3898,11 +3983,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3898 ei->i_extra_isize = sizeof(struct ext4_inode) - 3983 ei->i_extra_isize = sizeof(struct ext4_inode) -
3899 EXT4_GOOD_OLD_INODE_SIZE; 3984 EXT4_GOOD_OLD_INODE_SIZE;
3900 } else { 3985 } else {
3901 __le32 *magic = (void *)raw_inode + 3986 ext4_iget_extra_inode(inode, raw_inode, ei);
3902 EXT4_GOOD_OLD_INODE_SIZE +
3903 ei->i_extra_isize;
3904 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3905 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3906 } 3987 }
3907 } 3988 }
3908 3989
@@ -3925,17 +4006,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3925 ei->i_file_acl); 4006 ei->i_file_acl);
3926 ret = -EIO; 4007 ret = -EIO;
3927 goto bad_inode; 4008 goto bad_inode;
3928 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4009 } else if (!ext4_has_inline_data(inode)) {
3929 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4010 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3930 (S_ISLNK(inode->i_mode) && 4011 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3931 !ext4_inode_is_fast_symlink(inode))) 4012 (S_ISLNK(inode->i_mode) &&
3932 /* Validate extent which is part of inode */ 4013 !ext4_inode_is_fast_symlink(inode))))
3933 ret = ext4_ext_check_inode(inode); 4014 /* Validate extent which is part of inode */
3934 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4015 ret = ext4_ext_check_inode(inode);
3935 (S_ISLNK(inode->i_mode) && 4016 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3936 !ext4_inode_is_fast_symlink(inode))) { 4017 (S_ISLNK(inode->i_mode) &&
3937 /* Validate block references which are part of inode */ 4018 !ext4_inode_is_fast_symlink(inode))) {
3938 ret = ext4_ind_check_inode(inode); 4019 /* Validate block references which are part of inode */
4020 ret = ext4_ind_check_inode(inode);
4021 }
3939 } 4022 }
3940 if (ret) 4023 if (ret)
3941 goto bad_inode; 4024 goto bad_inode;
@@ -4122,9 +4205,10 @@ static int ext4_do_update_inode(handle_t *handle,
4122 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4205 cpu_to_le32(new_encode_dev(inode->i_rdev));
4123 raw_inode->i_block[2] = 0; 4206 raw_inode->i_block[2] = 0;
4124 } 4207 }
4125 } else 4208 } else if (!ext4_has_inline_data(inode)) {
4126 for (block = 0; block < EXT4_N_BLOCKS; block++) 4209 for (block = 0; block < EXT4_N_BLOCKS; block++)
4127 raw_inode->i_block[block] = ei->i_data[block]; 4210 raw_inode->i_block[block] = ei->i_data[block];
4211 }
4128 4212
4129 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4213 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4130 if (ei->i_extra_isize) { 4214 if (ei->i_extra_isize) {
@@ -4811,8 +4895,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4811 * journal_start/journal_stop which can block and take a long time 4895 * journal_start/journal_stop which can block and take a long time
4812 */ 4896 */
4813 if (page_has_buffers(page)) { 4897 if (page_has_buffers(page)) {
4814 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 4898 if (!ext4_walk_page_buffers(NULL, page_buffers(page),
4815 ext4_bh_unmapped)) { 4899 0, len, NULL,
4900 ext4_bh_unmapped)) {
4816 /* Wait so that we don't change page under IO */ 4901 /* Wait so that we don't change page under IO */
4817 wait_on_page_writeback(page); 4902 wait_on_page_writeback(page);
4818 ret = VM_FAULT_LOCKED; 4903 ret = VM_FAULT_LOCKED;
@@ -4833,7 +4918,7 @@ retry_alloc:
4833 } 4918 }
4834 ret = __block_page_mkwrite(vma, vmf, get_block); 4919 ret = __block_page_mkwrite(vma, vmf, get_block);
4835 if (!ret && ext4_should_journal_data(inode)) { 4920 if (!ret && ext4_should_journal_data(inode)) {
4836 if (walk_page_buffers(handle, page_buffers(page), 0, 4921 if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
4837 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4922 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4838 unlock_page(page); 4923 unlock_page(page);
4839 ret = VM_FAULT_SIGBUS; 4924 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 526e55358606..1bf6fe785c4f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
1373 ex->fe_start += next; 1373 ex->fe_start += next;
1374 1374
1375 while (needed > ex->fe_len && 1375 while (needed > ex->fe_len &&
1376 (buddy = mb_find_buddy(e4b, order, &max))) { 1376 mb_find_buddy(e4b, order, &max)) {
1377 1377
1378 if (block + 1 >= max) 1378 if (block + 1 >= max)
1379 break; 1379 break;
@@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb,
2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2608 entry->efd_count, entry->efd_group, entry); 2608 entry->efd_count, entry->efd_group, entry);
2609 2609
2610 if (test_opt(sb, DISCARD)) 2610 if (test_opt(sb, DISCARD)) {
2611 ext4_issue_discard(sb, entry->efd_group, 2611 err = ext4_issue_discard(sb, entry->efd_group,
2612 entry->efd_start_cluster, entry->efd_count); 2612 entry->efd_start_cluster,
2613 entry->efd_count);
2614 if (err && err != -EOPNOTSUPP)
2615 ext4_msg(sb, KERN_WARNING, "discard request in"
2616 " group:%d block:%d count:%d failed"
2617 " with %d", entry->efd_group,
2618 entry->efd_start_cluster,
2619 entry->efd_count, err);
2620 }
2613 2621
2614 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 2622 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2615 /* we expect to find existing buddy because it's pinned */ 2623 /* we expect to find existing buddy because it's pinned */
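Callers of ext4_issue_discard() now check the result, warn on real failures, and deliberately swallow -EOPNOTSUPP, since a device without discard support is expected and not worth logging; freeing the blocks proceeds either way. The policy in miniature, with a stand-in discard function:

#include <stdio.h>
#include <errno.h>

static int issue_discard_demo(int device_supports_discard)
{
	return device_supports_discard ? 0 : -EOPNOTSUPP;
}

int main(void)
{
	int err = issue_discard_demo(0);

	if (err && err != -EOPNOTSUPP)
		fprintf(stderr, "discard failed with %d\n", err);
	/* either way the blocks are still freed */
	puts("continue freeing blocks");
	return 0;
}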
@@ -4310,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4310repeat: 4318repeat:
4311 /* allocate space in core */ 4319 /* allocate space in core */
4312 *errp = ext4_mb_regular_allocator(ac); 4320 *errp = ext4_mb_regular_allocator(ac);
4313 if (*errp) 4321 if (*errp) {
4322 ext4_discard_allocated_blocks(ac);
4314 goto errout; 4323 goto errout;
4324 }
4315 4325
4316 /* as we've just preallocated more space than 4326 /* as we've just preallocated more space than
4317 * user requested originally, we store allocated 4327 * user requested originally, we store allocated
@@ -4333,10 +4343,10 @@ repeat:
4333 ac->ac_b_ex.fe_len = 0; 4343 ac->ac_b_ex.fe_len = 0;
4334 ac->ac_status = AC_STATUS_CONTINUE; 4344 ac->ac_status = AC_STATUS_CONTINUE;
4335 goto repeat; 4345 goto repeat;
4336 } else if (*errp) 4346 } else if (*errp) {
4337 errout:
4338 ext4_discard_allocated_blocks(ac); 4347 ext4_discard_allocated_blocks(ac);
4339 else { 4348 goto errout;
4349 } else {
4340 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4350 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4341 ar->len = ac->ac_b_ex.fe_len; 4351 ar->len = ac->ac_b_ex.fe_len;
4342 } 4352 }
@@ -4347,6 +4357,7 @@ repeat:
4347 *errp = -ENOSPC; 4357 *errp = -ENOSPC;
4348 } 4358 }
4349 4359
4360errout:
4350 if (*errp) { 4361 if (*errp) {
4351 ac->ac_b_ex.fe_len = 0; 4362 ac->ac_b_ex.fe_len = 0;
4352 ar->len = 0; 4363 ar->len = 0;
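ext4_mb_new_blocks() now funnels every failure through the single errout label, which calls ext4_discard_allocated_blocks() once instead of duplicating the cleanup at each failure site. The shape of that consolidation, reduced to a compilable sketch:

#include <stdio.h>

static void discard_allocated_demo(void)
{
	puts("discard whatever was preallocated");
}

int main(void)
{
	int errp = -1;          /* pretend the regular allocator failed */

	if (errp)
		goto errout;
	puts("hand the allocated block to the caller");
	return 0;
errout:
	discard_allocated_demo();
	return 1;
}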
@@ -4656,8 +4667,16 @@ do_more:
4656 * with group lock held. generate_buddy look at 4667 * with group lock held. generate_buddy look at
4657 * them with group lock_held 4668 * them with group lock_held
4658 */ 4669 */
4659 if (test_opt(sb, DISCARD)) 4670 if (test_opt(sb, DISCARD)) {
4660 ext4_issue_discard(sb, block_group, bit, count); 4671 err = ext4_issue_discard(sb, block_group, bit, count);
4672 if (err && err != -EOPNOTSUPP)
4673 ext4_msg(sb, KERN_WARNING, "discard request in"
4674 " group:%d block:%d count:%lu failed"
4675 " with %d", block_group, bit, count,
4676 err);
4677 }
4678
4679
4661 ext4_lock_group(sb, block_group); 4680 ext4_lock_group(sb, block_group);
4662 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4681 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4663 mb_free_blocks(inode, &e4b, bit, count_clusters); 4682 mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4851,10 +4870,11 @@ error_return:
4851 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4870 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4852 * be called under the group lock. 4871 * be called under the group lock.
4853 */ 4872 */
4854static void ext4_trim_extent(struct super_block *sb, int start, int count, 4873static int ext4_trim_extent(struct super_block *sb, int start, int count,
4855 ext4_group_t group, struct ext4_buddy *e4b) 4874 ext4_group_t group, struct ext4_buddy *e4b)
4856{ 4875{
4857 struct ext4_free_extent ex; 4876 struct ext4_free_extent ex;
4877 int ret = 0;
4858 4878
4859 trace_ext4_trim_extent(sb, group, start, count); 4879 trace_ext4_trim_extent(sb, group, start, count);
4860 4880
@@ -4870,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4870 */ 4890 */
4871 mb_mark_used(e4b, &ex); 4891 mb_mark_used(e4b, &ex);
4872 ext4_unlock_group(sb, group); 4892 ext4_unlock_group(sb, group);
4873 ext4_issue_discard(sb, group, start, count); 4893 ret = ext4_issue_discard(sb, group, start, count);
4874 ext4_lock_group(sb, group); 4894 ext4_lock_group(sb, group);
4875 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4895 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4896 return ret;
4876} 4897}
4877 4898
4878/** 4899/**
@@ -4901,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4901 void *bitmap; 4922 void *bitmap;
4902 ext4_grpblk_t next, count = 0, free_count = 0; 4923 ext4_grpblk_t next, count = 0, free_count = 0;
4903 struct ext4_buddy e4b; 4924 struct ext4_buddy e4b;
4904 int ret; 4925 int ret = 0;
4905 4926
4906 trace_ext4_trim_all_free(sb, group, start, max); 4927 trace_ext4_trim_all_free(sb, group, start, max);
4907 4928
@@ -4928,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4928 next = mb_find_next_bit(bitmap, max + 1, start); 4949 next = mb_find_next_bit(bitmap, max + 1, start);
4929 4950
4930 if ((next - start) >= minblocks) { 4951 if ((next - start) >= minblocks) {
4931 ext4_trim_extent(sb, start, 4952 ret = ext4_trim_extent(sb, start,
4932 next - start, group, &e4b); 4953 next - start, group, &e4b);
4954 if (ret && ret != -EOPNOTSUPP)
4955 break;
4956 ret = 0;
4933 count += next - start; 4957 count += next - start;
4934 } 4958 }
4935 free_count += next - start; 4959 free_count += next - start;
@@ -4950,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4950 break; 4974 break;
4951 } 4975 }
4952 4976
4953 if (!ret) 4977 if (!ret) {
4978 ret = count;
4954 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 4979 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
4980 }
4955out: 4981out:
4956 ext4_unlock_group(sb, group); 4982 ext4_unlock_group(sb, group);
4957 ext4_mb_unload_buddy(&e4b); 4983 ext4_mb_unload_buddy(&e4b);
@@ -4959,7 +4985,7 @@ out:
4959 ext4_debug("trimmed %d blocks in the group %d\n", 4985 ext4_debug("trimmed %d blocks in the group %d\n",
4960 count, group); 4986 count, group);
4961 4987
4962 return count; 4988 return ret;
4963} 4989}
4964 4990
4965/** 4991/**
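With ext4_trim_extent() now returning an int, ext4_trim_all_free() can follow the usual kernel convention: a non-negative count of trimmed clusters on success and a negative errno on failure, with -EOPNOTSUPP tolerated mid-loop. A userspace sketch of that convention (trim_group_demo() is hypothetical):

#include <stdio.h>
#include <errno.h>

static int trim_group_demo(int discard_ok, int count)
{
	int ret = discard_ok ? 0 : -EOPNOTSUPP;

	if (ret && ret != -EOPNOTSUPP)
		return ret;     /* hard failure: propagate the errno */
	return count;           /* success: report clusters trimmed */
}

int main(void)
{
	int ret = trim_group_demo(1, 128);

	if (ret < 0)
		fprintf(stderr, "trim failed: %d\n", ret);
	else
		printf("trimmed %d clusters\n", ret);
	return 0;
}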
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f1bb32ec0169..db8226d595fa 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -14,6 +14,7 @@
14 14
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include "ext4_jbd2.h" 16#include "ext4_jbd2.h"
17#include "ext4_extents.h"
17 18
18/* 19/*
19 * The contiguous blocks details which can be 20 * The contiguous blocks details which can be
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 292daeeed455..d9cc5ee42f53 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_extents.h"
21 22
22/** 23/**
23 * get_ext_path - Find an extent path for designated logical block number. 24 * get_ext_path - Find an extent path for designated logical block number.
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d600a69fc9d..cac448282331 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
202 struct inode *inode); 202 struct inode *inode);
203 203
204/* checksumming functions */ 204/* checksumming functions */
205#define EXT4_DIRENT_TAIL(block, blocksize) \ 205void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
206 ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ 206 unsigned int blocksize)
207 ((blocksize) - \
208 sizeof(struct ext4_dir_entry_tail))))
209
210static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
211 unsigned int blocksize)
212{ 207{
213 memset(t, 0, sizeof(struct ext4_dir_entry_tail)); 208 memset(t, 0, sizeof(struct ext4_dir_entry_tail));
214 t->det_rec_len = ext4_rec_len_to_disk( 209 t->det_rec_len = ext4_rec_len_to_disk(
@@ -261,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode,
261 return cpu_to_le32(csum); 256 return cpu_to_le32(csum);
262} 257}
263 258
259static void warn_no_space_for_csum(struct inode *inode)
260{
261 ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
262 "checksum. Please run e2fsck -D.", inode->i_ino);
263}
264
264int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) 265int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
265{ 266{
266 struct ext4_dir_entry_tail *t; 267 struct ext4_dir_entry_tail *t;
@@ -271,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
271 272
272 t = get_dirent_tail(inode, dirent); 273 t = get_dirent_tail(inode, dirent);
273 if (!t) { 274 if (!t) {
274 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 275 warn_no_space_for_csum(inode);
275 "leaf for checksum. Please run e2fsck -D.");
276 return 0; 276 return 0;
277 } 277 }
278 278
@@ -294,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
294 294
295 t = get_dirent_tail(inode, dirent); 295 t = get_dirent_tail(inode, dirent);
296 if (!t) { 296 if (!t) {
297 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 297 warn_no_space_for_csum(inode);
298 "leaf for checksum. Please run e2fsck -D.");
299 return; 298 return;
300 } 299 }
301 300
@@ -303,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode,
303 (void *)t - (void *)dirent); 302 (void *)t - (void *)dirent);
304} 303}
305 304
306static inline int ext4_handle_dirty_dirent_node(handle_t *handle, 305int ext4_handle_dirty_dirent_node(handle_t *handle,
307 struct inode *inode, 306 struct inode *inode,
308 struct buffer_head *bh) 307 struct buffer_head *bh)
309{ 308{
310 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); 309 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
311 return ext4_handle_dirty_metadata(handle, inode, bh); 310 return ext4_handle_dirty_metadata(handle, inode, bh);
@@ -377,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
377 count = le16_to_cpu(c->count); 376 count = le16_to_cpu(c->count);
378 if (count_offset + (limit * sizeof(struct dx_entry)) > 377 if (count_offset + (limit * sizeof(struct dx_entry)) >
379 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 378 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
380 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 379 warn_no_space_for_csum(inode);
381 "tree checksum found. Run e2fsck -D.");
382 return 1; 380 return 1;
383 } 381 }
384 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); 382 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -408,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
408 count = le16_to_cpu(c->count); 406 count = le16_to_cpu(c->count);
409 if (count_offset + (limit * sizeof(struct dx_entry)) > 407 if (count_offset + (limit * sizeof(struct dx_entry)) >
410 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 408 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
411 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 409 warn_no_space_for_csum(inode);
412 "tree checksum. Run e2fsck -D.");
413 return; 410 return;
414 } 411 }
415 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); 412 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -890,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
890 EXT4_DIR_REC_LEN(0)); 887 EXT4_DIR_REC_LEN(0));
891 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 888 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
892 if (ext4_check_dir_entry(dir, NULL, de, bh, 889 if (ext4_check_dir_entry(dir, NULL, de, bh,
890 bh->b_data, bh->b_size,
893 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 891 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
894 + ((char *)de - bh->b_data))) { 892 + ((char *)de - bh->b_data))) {
895 /* On error, skip the f_pos to the next block. */ 893 /* On error, skip the f_pos to the next block. */
@@ -1007,6 +1005,15 @@ errout:
1007 return (err); 1005 return (err);
1008} 1006}
1009 1007
1008static inline int search_dirblock(struct buffer_head *bh,
1009 struct inode *dir,
1010 const struct qstr *d_name,
1011 unsigned int offset,
1012 struct ext4_dir_entry_2 **res_dir)
1013{
1014 return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
1015 d_name, offset, res_dir);
1016}
1010 1017
1011/* 1018/*
1012 * Directory block splitting, compacting 1019 * Directory block splitting, compacting
@@ -1081,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1081 dx_set_count(entries, count + 1); 1088 dx_set_count(entries, count + 1);
1082} 1089}
1083 1090
1084static void ext4_update_dx_flag(struct inode *inode)
1085{
1086 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1087 EXT4_FEATURE_COMPAT_DIR_INDEX))
1088 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1089}
1090
1091/* 1091/*
1092 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. 1092 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
1093 * 1093 *
@@ -1107,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name,
1107/* 1107/*
1108 * Returns 0 if not found, -1 on failure, and 1 on success 1108 * Returns 0 if not found, -1 on failure, and 1 on success
1109 */ 1109 */
1110static inline int search_dirblock(struct buffer_head *bh, 1110int search_dir(struct buffer_head *bh,
1111 struct inode *dir, 1111 char *search_buf,
1112 const struct qstr *d_name, 1112 int buf_size,
1113 unsigned int offset, 1113 struct inode *dir,
1114 struct ext4_dir_entry_2 ** res_dir) 1114 const struct qstr *d_name,
1115 unsigned int offset,
1116 struct ext4_dir_entry_2 **res_dir)
1115{ 1117{
1116 struct ext4_dir_entry_2 * de; 1118 struct ext4_dir_entry_2 * de;
1117 char * dlimit; 1119 char * dlimit;
@@ -1119,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1119 const char *name = d_name->name; 1121 const char *name = d_name->name;
1120 int namelen = d_name->len; 1122 int namelen = d_name->len;
1121 1123
1122 de = (struct ext4_dir_entry_2 *) bh->b_data; 1124 de = (struct ext4_dir_entry_2 *)search_buf;
1123 dlimit = bh->b_data + dir->i_sb->s_blocksize; 1125 dlimit = search_buf + buf_size;
1124 while ((char *) de < dlimit) { 1126 while ((char *) de < dlimit) {
1125 /* this code is executed quadratically often */ 1127 /* this code is executed quadratically often */
1126 /* do minimal checking `by hand' */ 1128 /* do minimal checking `by hand' */
@@ -1128,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1128 if ((char *) de + namelen <= dlimit && 1130 if ((char *) de + namelen <= dlimit &&
1129 ext4_match (namelen, name, de)) { 1131 ext4_match (namelen, name, de)) {
1130 /* found a match - just to be sure, do a full check */ 1132 /* found a match - just to be sure, do a full check */
1131 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1133 if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
1134 bh->b_size, offset))
1132 return -1; 1135 return -1;
1133 *res_dir = de; 1136 *res_dir = de;
1134 return 1; 1137 return 1;
@@ -1144,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh,
1144 return 0; 1147 return 0;
1145} 1148}
1146 1149
1150static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
1151 struct ext4_dir_entry *de)
1152{
1153 struct super_block *sb = dir->i_sb;
1154
1155 if (!is_dx(dir))
1156 return 0;
1157 if (block == 0)
1158 return 1;
1159 if (de->inode == 0 &&
1160 ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
1161 sb->s_blocksize)
1162 return 1;
1163 return 0;
1164}
1147 1165
1148/* 1166/*
1149 * ext4_find_entry() 1167 * ext4_find_entry()
@@ -1158,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1158 */ 1176 */
1159static struct buffer_head * ext4_find_entry (struct inode *dir, 1177static struct buffer_head * ext4_find_entry (struct inode *dir,
1160 const struct qstr *d_name, 1178 const struct qstr *d_name,
1161 struct ext4_dir_entry_2 ** res_dir) 1179 struct ext4_dir_entry_2 **res_dir,
1180 int *inlined)
1162{ 1181{
1163 struct super_block *sb; 1182 struct super_block *sb;
1164 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 1183 struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1179,6 +1198,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1179 namelen = d_name->len; 1198 namelen = d_name->len;
1180 if (namelen > EXT4_NAME_LEN) 1199 if (namelen > EXT4_NAME_LEN)
1181 return NULL; 1200 return NULL;
1201
1202 if (ext4_has_inline_data(dir)) {
1203 int has_inline_data = 1;
1204 ret = ext4_find_inline_entry(dir, d_name, res_dir,
1205 &has_inline_data);
1206 if (has_inline_data) {
1207 if (inlined)
1208 *inlined = 1;
1209 return ret;
1210 }
1211 }
1212
1182 if ((namelen <= 2) && (name[0] == '.') && 1213 if ((namelen <= 2) && (name[0] == '.') &&
1183 (name[1] == '.' || name[1] == '\0')) { 1214 (name[1] == '.' || name[1] == '\0')) {
1184 /* 1215 /*
@@ -1244,6 +1275,8 @@ restart:
1244 goto next; 1275 goto next;
1245 } 1276 }
1246 if (!buffer_verified(bh) && 1277 if (!buffer_verified(bh) &&
1278 !is_dx_internal_node(dir, block,
1279 (struct ext4_dir_entry *)bh->b_data) &&
1247 !ext4_dirent_csum_verify(dir, 1280 !ext4_dirent_csum_verify(dir,
1248 (struct ext4_dir_entry *)bh->b_data)) { 1281 (struct ext4_dir_entry *)bh->b_data)) {
1249 EXT4_ERROR_INODE(dir, "checksumming directory " 1282 EXT4_ERROR_INODE(dir, "checksumming directory "
@@ -1361,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1361 if (dentry->d_name.len > EXT4_NAME_LEN) 1394 if (dentry->d_name.len > EXT4_NAME_LEN)
1362 return ERR_PTR(-ENAMETOOLONG); 1395 return ERR_PTR(-ENAMETOOLONG);
1363 1396
1364 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1397 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1365 inode = NULL; 1398 inode = NULL;
1366 if (bh) { 1399 if (bh) {
1367 __u32 ino = le32_to_cpu(de->inode); 1400 __u32 ino = le32_to_cpu(de->inode);
@@ -1395,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1395 struct ext4_dir_entry_2 * de; 1428 struct ext4_dir_entry_2 * de;
1396 struct buffer_head *bh; 1429 struct buffer_head *bh;
1397 1430
1398 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1431 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1399 if (!bh) 1432 if (!bh)
1400 return ERR_PTR(-ENOENT); 1433 return ERR_PTR(-ENOENT);
1401 ino = le32_to_cpu(de->inode); 1434 ino = le32_to_cpu(de->inode);
@@ -1593,6 +1626,63 @@ errout:
1593 return NULL; 1626 return NULL;
1594} 1627}
1595 1628
1629int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1630 struct buffer_head *bh,
1631 void *buf, int buf_size,
1632 const char *name, int namelen,
1633 struct ext4_dir_entry_2 **dest_de)
1634{
1635 struct ext4_dir_entry_2 *de;
1636 unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
1637 int nlen, rlen;
1638 unsigned int offset = 0;
1639 char *top;
1640
1641 de = (struct ext4_dir_entry_2 *)buf;
1642 top = buf + buf_size - reclen;
1643 while ((char *) de <= top) {
1644 if (ext4_check_dir_entry(dir, NULL, de, bh,
1645 buf, buf_size, offset))
1646 return -EIO;
1647 if (ext4_match(namelen, name, de))
1648 return -EEXIST;
1649 nlen = EXT4_DIR_REC_LEN(de->name_len);
1650 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1651 if ((de->inode ? rlen - nlen : rlen) >= reclen)
1652 break;
1653 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1654 offset += rlen;
1655 }
1656 if ((char *) de > top)
1657 return -ENOSPC;
1658
1659 *dest_de = de;
1660 return 0;
1661}
1662
1663void ext4_insert_dentry(struct inode *inode,
1664 struct ext4_dir_entry_2 *de,
1665 int buf_size,
1666 const char *name, int namelen)
1667{
1668
1669 int nlen, rlen;
1670
1671 nlen = EXT4_DIR_REC_LEN(de->name_len);
1672 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1673 if (de->inode) {
1674 struct ext4_dir_entry_2 *de1 =
1675 (struct ext4_dir_entry_2 *)((char *)de + nlen);
1676 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
1677 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
1678 de = de1;
1679 }
1680 de->file_type = EXT4_FT_UNKNOWN;
1681 de->inode = cpu_to_le32(inode->i_ino);
1682 ext4_set_de_type(inode->i_sb, de, inode->i_mode);
1683 de->name_len = namelen;
1684 memcpy(de->name, name, namelen);
1685}
1596/* 1686/*
1597 * Add a new entry into a directory (leaf) block. If de is non-NULL, 1687 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1598 * it points to a directory entry which is guaranteed to be large 1688 * it points to a directory entry which is guaranteed to be large
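ext4_find_dest_de() factors out the classic dirent fit test so the inline-data code can reuse it: a live entry only offers its slack (rlen - nlen), while a deleted entry (inode == 0) can be reused whole. A standalone sketch of the test, using the 4-byte-rounded EXT4_DIR_REC_LEN() shape with made-up lengths:

#include <stdio.h>

#define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3u)

int main(void)
{
	unsigned int de_inode = 12;           /* nonzero: entry is in use */
	unsigned int de_name_len = 5;
	unsigned int rlen = 40;               /* on-disk rec_len of this entry */
	unsigned int reclen = DIR_REC_LEN(11);   /* space the new name needs */

	unsigned int nlen = DIR_REC_LEN(de_name_len);
	unsigned int avail = de_inode ? rlen - nlen : rlen;

	printf("need %u, have %u -> %s\n", reclen, avail,
	       avail >= reclen ? "fits here" : "keep scanning");
	return 0;
}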
@@ -1608,12 +1698,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1608 struct inode *dir = dentry->d_parent->d_inode; 1698 struct inode *dir = dentry->d_parent->d_inode;
1609 const char *name = dentry->d_name.name; 1699 const char *name = dentry->d_name.name;
1610 int namelen = dentry->d_name.len; 1700 int namelen = dentry->d_name.len;
1611 unsigned int offset = 0;
1612 unsigned int blocksize = dir->i_sb->s_blocksize; 1701 unsigned int blocksize = dir->i_sb->s_blocksize;
1613 unsigned short reclen; 1702 unsigned short reclen;
1614 int nlen, rlen, err;
1615 char *top;
1616 int csum_size = 0; 1703 int csum_size = 0;
1704 int err;
1617 1705
1618 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1706 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1619 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1707 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -1621,22 +1709,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1621 1709
1622 reclen = EXT4_DIR_REC_LEN(namelen); 1710 reclen = EXT4_DIR_REC_LEN(namelen);
1623 if (!de) { 1711 if (!de) {
1624 de = (struct ext4_dir_entry_2 *)bh->b_data; 1712 err = ext4_find_dest_de(dir, inode,
1625 top = bh->b_data + (blocksize - csum_size) - reclen; 1713 bh, bh->b_data, blocksize - csum_size,
1626 while ((char *) de <= top) { 1714 name, namelen, &de);
1627 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1715 if (err)
1628 return -EIO; 1716 return err;
1629 if (ext4_match(namelen, name, de))
1630 return -EEXIST;
1631 nlen = EXT4_DIR_REC_LEN(de->name_len);
1632 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1633 if ((de->inode? rlen - nlen: rlen) >= reclen)
1634 break;
1635 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1636 offset += rlen;
1637 }
1638 if ((char *) de > top)
1639 return -ENOSPC;
1640 } 1717 }
1641 BUFFER_TRACE(bh, "get_write_access"); 1718 BUFFER_TRACE(bh, "get_write_access");
1642 err = ext4_journal_get_write_access(handle, bh); 1719 err = ext4_journal_get_write_access(handle, bh);
@@ -1646,19 +1723,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1646 } 1723 }
1647 1724
1648 /* By now the buffer is marked for journaling */ 1725 /* By now the buffer is marked for journaling */
1649 nlen = EXT4_DIR_REC_LEN(de->name_len); 1726 ext4_insert_dentry(inode, de, blocksize, name, namelen);
1650 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1727
1651 if (de->inode) {
1652 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1653 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1654 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1655 de = de1;
1656 }
1657 de->file_type = EXT4_FT_UNKNOWN;
1658 de->inode = cpu_to_le32(inode->i_ino);
1659 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1660 de->name_len = namelen;
1661 memcpy(de->name, name, namelen);
1662 /* 1728 /*
1663 * XXX shouldn't update any times until successful 1729 * XXX shouldn't update any times until successful
1664 * completion of syscall, but too many callers depend 1730 * completion of syscall, but too many callers depend
@@ -1831,6 +1897,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1831 blocksize = sb->s_blocksize; 1897 blocksize = sb->s_blocksize;
1832 if (!dentry->d_name.len) 1898 if (!dentry->d_name.len)
1833 return -EINVAL; 1899 return -EINVAL;
1900
1901 if (ext4_has_inline_data(dir)) {
1902 retval = ext4_try_add_inline_entry(handle, dentry, inode);
1903 if (retval < 0)
1904 return retval;
1905 if (retval == 1) {
1906 retval = 0;
1907 return retval;
1908 }
1909 }
1910
1834 if (is_dx(dir)) { 1911 if (is_dx(dir)) {
1835 retval = ext4_dx_add_entry(handle, dentry, inode); 1912 retval = ext4_dx_add_entry(handle, dentry, inode);
1836 if (!retval || (retval != ERR_BAD_DX_DIR)) 1913 if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -2036,36 +2113,29 @@ cleanup:
2036} 2113}
2037 2114
2038/* 2115/*
2039 * ext4_delete_entry deletes a directory entry by merging it with the 2116 * ext4_generic_delete_entry deletes a directory entry by merging it
2040 * previous entry 2117 * with the previous entry
2041 */ 2118 */
2042static int ext4_delete_entry(handle_t *handle, 2119int ext4_generic_delete_entry(handle_t *handle,
2043 struct inode *dir, 2120 struct inode *dir,
2044 struct ext4_dir_entry_2 *de_del, 2121 struct ext4_dir_entry_2 *de_del,
2045 struct buffer_head *bh) 2122 struct buffer_head *bh,
2123 void *entry_buf,
2124 int buf_size,
2125 int csum_size)
2046{ 2126{
2047 struct ext4_dir_entry_2 *de, *pde; 2127 struct ext4_dir_entry_2 *de, *pde;
2048 unsigned int blocksize = dir->i_sb->s_blocksize; 2128 unsigned int blocksize = dir->i_sb->s_blocksize;
2049 int csum_size = 0; 2129 int i;
2050 int i, err;
2051
2052 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2053 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2054 csum_size = sizeof(struct ext4_dir_entry_tail);
2055 2130
2056 i = 0; 2131 i = 0;
2057 pde = NULL; 2132 pde = NULL;
2058 de = (struct ext4_dir_entry_2 *) bh->b_data; 2133 de = (struct ext4_dir_entry_2 *)entry_buf;
2059 while (i < bh->b_size - csum_size) { 2134 while (i < buf_size - csum_size) {
2060 if (ext4_check_dir_entry(dir, NULL, de, bh, i)) 2135 if (ext4_check_dir_entry(dir, NULL, de, bh,
2136 bh->b_data, bh->b_size, i))
2061 return -EIO; 2137 return -EIO;
2062 if (de == de_del) { 2138 if (de == de_del) {
2063 BUFFER_TRACE(bh, "get_write_access");
2064 err = ext4_journal_get_write_access(handle, bh);
2065 if (unlikely(err)) {
2066 ext4_std_error(dir->i_sb, err);
2067 return err;
2068 }
2069 if (pde) 2139 if (pde)
2070 pde->rec_len = ext4_rec_len_to_disk( 2140 pde->rec_len = ext4_rec_len_to_disk(
2071 ext4_rec_len_from_disk(pde->rec_len, 2141 ext4_rec_len_from_disk(pde->rec_len,
@@ -2076,12 +2146,6 @@ static int ext4_delete_entry(handle_t *handle,
2076 else 2146 else
2077 de->inode = 0; 2147 de->inode = 0;
2078 dir->i_version++; 2148 dir->i_version++;
2079 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2080 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2081 if (unlikely(err)) {
2082 ext4_std_error(dir->i_sb, err);
2083 return err;
2084 }
2085 return 0; 2149 return 0;
2086 } 2150 }
2087 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 2151 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
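
Note that ext4_generic_delete_entry() never compacts the buffer: when the victim is found, the previous record simply absorbs it by growing its rec_len (or, for the very first record, de->inode is zeroed instead). As a worked example in a 4096-byte block, ignoring the checksum tail: if pde starts at offset 24 with rec_len 40 and de_del follows at offset 64 with rec_len 4032, deletion sets pde->rec_len to 40 + 4032 = 4072, and the reclaimed span can later be split again by ext4_insert_dentry().
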
@@ -2091,6 +2155,48 @@ static int ext4_delete_entry(handle_t *handle,
2091 return -ENOENT; 2155 return -ENOENT;
2092} 2156}
2093 2157
2158static int ext4_delete_entry(handle_t *handle,
2159 struct inode *dir,
2160 struct ext4_dir_entry_2 *de_del,
2161 struct buffer_head *bh)
2162{
2163 int err, csum_size = 0;
2164
2165 if (ext4_has_inline_data(dir)) {
2166 int has_inline_data = 1;
2167 err = ext4_delete_inline_entry(handle, dir, de_del, bh,
2168 &has_inline_data);
2169 if (has_inline_data)
2170 return err;
2171 }
2172
2173 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2174 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2175 csum_size = sizeof(struct ext4_dir_entry_tail);
2176
2177 BUFFER_TRACE(bh, "get_write_access");
2178 err = ext4_journal_get_write_access(handle, bh);
2179 if (unlikely(err))
2180 goto out;
2181
2182 err = ext4_generic_delete_entry(handle, dir, de_del,
2183 bh, bh->b_data,
2184 dir->i_sb->s_blocksize, csum_size);
2185 if (err)
2186 goto out;
2187
2188 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2189 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2190 if (unlikely(err))
2191 goto out;
2192
2193 return 0;
2194out:
2195 if (err != -ENOENT)
2196 ext4_std_error(dir->i_sb, err);
2197 return err;
2198}
2199
2094/* 2200/*
2095 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, 2201 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
2096 * since this indicates that nlinks count was previously 1. 2202 * since this indicates that nlinks count was previously 1.
@@ -2211,21 +2317,95 @@ retry:
2211 return err; 2317 return err;
2212} 2318}
2213 2319
2214static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2320struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2321 struct ext4_dir_entry_2 *de,
2322 int blocksize, int csum_size,
2323 unsigned int parent_ino, int dotdot_real_len)
2324{
2325 de->inode = cpu_to_le32(inode->i_ino);
2326 de->name_len = 1;
2327 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2328 blocksize);
2329 strcpy(de->name, ".");
2330 ext4_set_de_type(inode->i_sb, de, S_IFDIR);
2331
2332 de = ext4_next_entry(de, blocksize);
2333 de->inode = cpu_to_le32(parent_ino);
2334 de->name_len = 2;
2335 if (!dotdot_real_len)
2336 de->rec_len = ext4_rec_len_to_disk(blocksize -
2337 (csum_size + EXT4_DIR_REC_LEN(1)),
2338 blocksize);
2339 else
2340 de->rec_len = ext4_rec_len_to_disk(
2341 EXT4_DIR_REC_LEN(de->name_len), blocksize);
2342 strcpy(de->name, "..");
2343 ext4_set_de_type(inode->i_sb, de, S_IFDIR);
2344
2345 return ext4_next_entry(de, blocksize);
2346}
2347
2348static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
2349 struct inode *inode)
2215{ 2350{
2216 handle_t *handle;
2217 struct inode *inode;
2218 struct buffer_head *dir_block = NULL; 2351 struct buffer_head *dir_block = NULL;
2219 struct ext4_dir_entry_2 *de; 2352 struct ext4_dir_entry_2 *de;
2220 struct ext4_dir_entry_tail *t; 2353 struct ext4_dir_entry_tail *t;
2221 unsigned int blocksize = dir->i_sb->s_blocksize; 2354 unsigned int blocksize = dir->i_sb->s_blocksize;
2222 int csum_size = 0; 2355 int csum_size = 0;
2223 int err, retries = 0; 2356 int err;
2224 2357
2225 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2358 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2226 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2359 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2227 csum_size = sizeof(struct ext4_dir_entry_tail); 2360 csum_size = sizeof(struct ext4_dir_entry_tail);
2228 2361
2362 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2363 err = ext4_try_create_inline_dir(handle, dir, inode);
2364 if (err < 0 && err != -ENOSPC)
2365 goto out;
2366 if (!err)
2367 goto out;
2368 }
2369
2370 inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
 2371 dir_block = ext4_bread(handle, inode, 0, 1, &err);
 2372 if (!dir_block) {
2373 if (!err) {
2374 err = -EIO;
2375 ext4_error(inode->i_sb,
2376 "Directory hole detected on inode %lu\n",
2377 inode->i_ino);
2378 }
2379 goto out;
2380 }
2381 BUFFER_TRACE(dir_block, "get_write_access");
2382 err = ext4_journal_get_write_access(handle, dir_block);
2383 if (err)
2384 goto out;
2385 de = (struct ext4_dir_entry_2 *)dir_block->b_data;
2386 ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
2387 set_nlink(inode, 2);
2388 if (csum_size) {
2389 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2390 initialize_dirent_tail(t, blocksize);
2391 }
2392
2393 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2394 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2395 if (err)
2396 goto out;
2397 set_buffer_verified(dir_block);
2398out:
2399 brelse(dir_block);
2400 return err;
2401}
2402
2403static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2404{
2405 handle_t *handle;
2406 struct inode *inode;
2407 int err, retries = 0;
2408
2229 if (EXT4_DIR_LINK_MAX(dir)) 2409 if (EXT4_DIR_LINK_MAX(dir))
2230 return -EMLINK; 2410 return -EMLINK;
2231 2411
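
ext4_init_dot_dotdot() above fixes the shape of a fresh directory's first block. For a 4096-byte block with metadata_csum enabled (so csum_size = 12, and EXT4_DIR_REC_LEN() rounds 8 + name_len up to a multiple of 4, i.e. 12 bytes for both "." and ".."), the resulting layout is, as a sketch:

	offset    0: "."  inode = self,   rec_len = 12
	offset   12: ".." inode = parent, rec_len = 4096 - (12 + 12) = 4072
	offset 4084: dirent tail, a 12-byte fake entry holding the crc

With dotdot_real_len set, ".." keeps only its own 12 bytes instead of claiming the rest of the buffer, presumably for callers that pack further entries right after it, such as the inline-dir setup.
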
@@ -2249,47 +2429,9 @@ retry:
2249 2429
2250 inode->i_op = &ext4_dir_inode_operations; 2430 inode->i_op = &ext4_dir_inode_operations;
2251 inode->i_fop = &ext4_dir_operations; 2431 inode->i_fop = &ext4_dir_operations;
2252 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2432 err = ext4_init_new_dir(handle, dir, inode);
2253 if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2254 if (!err) {
2255 err = -EIO;
2256 ext4_error(inode->i_sb,
2257 "Directory hole detected on inode %lu\n",
2258 inode->i_ino);
2259 }
2260 goto out_clear_inode;
2261 }
2262 BUFFER_TRACE(dir_block, "get_write_access");
2263 err = ext4_journal_get_write_access(handle, dir_block);
2264 if (err)
2265 goto out_clear_inode;
2266 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
2267 de->inode = cpu_to_le32(inode->i_ino);
2268 de->name_len = 1;
2269 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2270 blocksize);
2271 strcpy(de->name, ".");
2272 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2273 de = ext4_next_entry(de, blocksize);
2274 de->inode = cpu_to_le32(dir->i_ino);
2275 de->rec_len = ext4_rec_len_to_disk(blocksize -
2276 (csum_size + EXT4_DIR_REC_LEN(1)),
2277 blocksize);
2278 de->name_len = 2;
2279 strcpy(de->name, "..");
2280 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2281 set_nlink(inode, 2);
2282
2283 if (csum_size) {
2284 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2285 initialize_dirent_tail(t, blocksize);
2286 }
2287
2288 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2289 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2290 if (err) 2433 if (err)
2291 goto out_clear_inode; 2434 goto out_clear_inode;
2292 set_buffer_verified(dir_block);
2293 err = ext4_mark_inode_dirty(handle, inode); 2435 err = ext4_mark_inode_dirty(handle, inode);
2294 if (!err) 2436 if (!err)
2295 err = ext4_add_entry(handle, dentry, inode); 2437 err = ext4_add_entry(handle, dentry, inode);
@@ -2309,7 +2451,6 @@ out_clear_inode:
2309 unlock_new_inode(inode); 2451 unlock_new_inode(inode);
2310 d_instantiate(dentry, inode); 2452 d_instantiate(dentry, inode);
2311out_stop: 2453out_stop:
2312 brelse(dir_block);
2313 ext4_journal_stop(handle); 2454 ext4_journal_stop(handle);
2314 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2455 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2315 goto retry; 2456 goto retry;
@@ -2327,6 +2468,14 @@ static int empty_dir(struct inode *inode)
2327 struct super_block *sb; 2468 struct super_block *sb;
2328 int err = 0; 2469 int err = 0;
2329 2470
2471 if (ext4_has_inline_data(inode)) {
2472 int has_inline_data = 1;
2473
2474 err = empty_inline_dir(inode, &has_inline_data);
2475 if (has_inline_data)
2476 return err;
2477 }
2478
2330 sb = inode->i_sb; 2479 sb = inode->i_sb;
2331 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 2480 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
2332 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 2481 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
@@ -2393,7 +2542,8 @@ static int empty_dir(struct inode *inode)
2393 set_buffer_verified(bh); 2542 set_buffer_verified(bh);
2394 de = (struct ext4_dir_entry_2 *) bh->b_data; 2543 de = (struct ext4_dir_entry_2 *) bh->b_data;
2395 } 2544 }
2396 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { 2545 if (ext4_check_dir_entry(inode, NULL, de, bh,
2546 bh->b_data, bh->b_size, offset)) {
2397 de = (struct ext4_dir_entry_2 *)(bh->b_data + 2547 de = (struct ext4_dir_entry_2 *)(bh->b_data +
2398 sb->s_blocksize); 2548 sb->s_blocksize);
2399 offset = (offset | (sb->s_blocksize - 1)) + 1; 2549 offset = (offset | (sb->s_blocksize - 1)) + 1;
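
Two threads run through this hunk and the series as a whole. First, ext4_check_dir_entry() now takes the containing buffer and its size explicitly (here bh->b_data/bh->b_size; for inline directories, the inline area) instead of assuming the entry lives in a block-sized bh. Second, the has_inline_data out-parameter pattern from empty_dir() above recurs at most call sites; its contract, as inferred from the callers:

	if (ext4_has_inline_data(inode)) {
		int has_inline_data = 1;

		err = empty_inline_dir(inode, &has_inline_data);
		/* the helper clears the flag if the directory turned
		 * out not to be inline after all; only then does the
		 * caller fall through to the block-based scan */
		if (has_inline_data)
			return err;
	}
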
@@ -2579,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2579 return PTR_ERR(handle); 2729 return PTR_ERR(handle);
2580 2730
2581 retval = -ENOENT; 2731 retval = -ENOENT;
2582 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2732 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2583 if (!bh) 2733 if (!bh)
2584 goto end_rmdir; 2734 goto end_rmdir;
2585 2735
@@ -2644,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2644 ext4_handle_sync(handle); 2794 ext4_handle_sync(handle);
2645 2795
2646 retval = -ENOENT; 2796 retval = -ENOENT;
2647 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2797 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2648 if (!bh) 2798 if (!bh)
2649 goto end_unlink; 2799 goto end_unlink;
2650 2800
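
ext4_find_entry() grows a fourth parameter reporting whether the match came from the inline area. Callers that only need the dirent, like ext4_rmdir() and ext4_unlink() here, pass NULL; ext4_rename() below passes &new_inlined so it can skip the block-oriented checksum/dirty path afterwards. The two call shapes, as used in this patch:

	/* caller does not care where the entry lives */
	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);

	/* caller needs to pick the right dirty/csum path later */
	int new_inlined = 0;
	bh = ext4_find_entry(new_dir, &new_dentry->d_name,
			     &new_de, &new_inlined);
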
@@ -2826,8 +2976,39 @@ retry:
2826 return err; 2976 return err;
2827} 2977}
2828 2978
2829#define PARENT_INO(buffer, size) \ 2979
2830 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) 2980/*
 2981 * Try to find the buffer head which contains the parent block.
 2982 * It is the inode block if the directory is stored inline, or the
 2983 * 1st block if it is a normal dir.
2984 */
2985static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
2986 struct inode *inode,
2987 int *retval,
2988 struct ext4_dir_entry_2 **parent_de,
2989 int *inlined)
2990{
2991 struct buffer_head *bh;
2992
2993 if (!ext4_has_inline_data(inode)) {
2994 if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
2995 if (!*retval) {
2996 *retval = -EIO;
2997 ext4_error(inode->i_sb,
2998 "Directory hole detected on inode %lu\n",
2999 inode->i_ino);
3000 }
3001 return NULL;
3002 }
3003 *parent_de = ext4_next_entry(
3004 (struct ext4_dir_entry_2 *)bh->b_data,
3005 inode->i_sb->s_blocksize);
3006 return bh;
3007 }
3008
3009 *inlined = 1;
3010 return ext4_get_first_inline_block(inode, parent_de, retval);
3011}
2831 3012
2832/* 3013/*
2833 * Anybody can rename anything with this: the permission checks are left to the 3014 * Anybody can rename anything with this: the permission checks are left to the
@@ -2841,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2841 struct buffer_head *old_bh, *new_bh, *dir_bh; 3022 struct buffer_head *old_bh, *new_bh, *dir_bh;
2842 struct ext4_dir_entry_2 *old_de, *new_de; 3023 struct ext4_dir_entry_2 *old_de, *new_de;
2843 int retval, force_da_alloc = 0; 3024 int retval, force_da_alloc = 0;
3025 int inlined = 0, new_inlined = 0;
3026 struct ext4_dir_entry_2 *parent_de;
2844 3027
2845 dquot_initialize(old_dir); 3028 dquot_initialize(old_dir);
2846 dquot_initialize(new_dir); 3029 dquot_initialize(new_dir);
@@ -2860,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2860 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3043 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2861 ext4_handle_sync(handle); 3044 ext4_handle_sync(handle);
2862 3045
2863 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 3046 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
2864 /* 3047 /*
2865 * Check for inode number is _not_ due to possible IO errors. 3048 * Check for inode number is _not_ due to possible IO errors.
2866 * We might rmdir the source, keep it as pwd of some process 3049 * We might rmdir the source, keep it as pwd of some process
@@ -2873,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2873 goto end_rename; 3056 goto end_rename;
2874 3057
2875 new_inode = new_dentry->d_inode; 3058 new_inode = new_dentry->d_inode;
2876 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); 3059 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
3060 &new_de, &new_inlined);
2877 if (new_bh) { 3061 if (new_bh) {
2878 if (!new_inode) { 3062 if (!new_inode) {
2879 brelse(new_bh); 3063 brelse(new_bh);
@@ -2887,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2887 goto end_rename; 3071 goto end_rename;
2888 } 3072 }
2889 retval = -EIO; 3073 retval = -EIO;
2890 if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { 3074 dir_bh = ext4_get_first_dir_block(handle, old_inode,
2891 if (!retval) { 3075 &retval, &parent_de,
2892 retval = -EIO; 3076 &inlined);
2893 ext4_error(old_inode->i_sb, 3077 if (!dir_bh)
2894 "Directory hole detected on inode %lu\n",
2895 old_inode->i_ino);
2896 }
2897 goto end_rename; 3078 goto end_rename;
2898 } 3079 if (!inlined && !buffer_verified(dir_bh) &&
2899 if (!buffer_verified(dir_bh) &&
2900 !ext4_dirent_csum_verify(old_inode, 3080 !ext4_dirent_csum_verify(old_inode,
2901 (struct ext4_dir_entry *)dir_bh->b_data)) 3081 (struct ext4_dir_entry *)dir_bh->b_data))
2902 goto end_rename; 3082 goto end_rename;
2903 set_buffer_verified(dir_bh); 3083 set_buffer_verified(dir_bh);
2904 if (le32_to_cpu(PARENT_INO(dir_bh->b_data, 3084 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
2905 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2906 goto end_rename; 3085 goto end_rename;
2907 retval = -EMLINK; 3086 retval = -EMLINK;
2908 if (!new_inode && new_dir != old_dir && 3087 if (!new_inode && new_dir != old_dir &&
@@ -2931,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2931 ext4_current_time(new_dir); 3110 ext4_current_time(new_dir);
2932 ext4_mark_inode_dirty(handle, new_dir); 3111 ext4_mark_inode_dirty(handle, new_dir);
2933 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 3112 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2934 retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); 3113 if (!new_inlined) {
2935 if (unlikely(retval)) { 3114 retval = ext4_handle_dirty_dirent_node(handle,
2936 ext4_std_error(new_dir->i_sb, retval); 3115 new_dir, new_bh);
2937 goto end_rename; 3116 if (unlikely(retval)) {
3117 ext4_std_error(new_dir->i_sb, retval);
3118 goto end_rename;
3119 }
2938 } 3120 }
2939 brelse(new_bh); 3121 brelse(new_bh);
2940 new_bh = NULL; 3122 new_bh = NULL;
@@ -2962,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2962 struct buffer_head *old_bh2; 3144 struct buffer_head *old_bh2;
2963 struct ext4_dir_entry_2 *old_de2; 3145 struct ext4_dir_entry_2 *old_de2;
2964 3146
2965 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); 3147 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3148 &old_de2, NULL);
2966 if (old_bh2) { 3149 if (old_bh2) {
2967 retval = ext4_delete_entry(handle, old_dir, 3150 retval = ext4_delete_entry(handle, old_dir,
2968 old_de2, old_bh2); 3151 old_de2, old_bh2);
@@ -2982,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2982 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 3165 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2983 ext4_update_dx_flag(old_dir); 3166 ext4_update_dx_flag(old_dir);
2984 if (dir_bh) { 3167 if (dir_bh) {
2985 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 3168 parent_de->inode = cpu_to_le32(new_dir->i_ino);
2986 cpu_to_le32(new_dir->i_ino);
2987 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 3169 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2988 if (is_dx(old_inode)) { 3170 if (!inlined) {
2989 retval = ext4_handle_dirty_dx_node(handle, 3171 if (is_dx(old_inode)) {
2990 old_inode, 3172 retval = ext4_handle_dirty_dx_node(handle,
2991 dir_bh); 3173 old_inode,
3174 dir_bh);
3175 } else {
3176 retval = ext4_handle_dirty_dirent_node(handle,
3177 old_inode, dir_bh);
3178 }
2992 } else { 3179 } else {
2993 retval = ext4_handle_dirty_dirent_node(handle, 3180 retval = ext4_mark_inode_dirty(handle, old_inode);
2994 old_inode,
2995 dir_bh);
2996 } 3181 }
2997 if (retval) { 3182 if (retval) {
2998 ext4_std_error(old_dir->i_sb, retval); 3183 ext4_std_error(old_dir->i_sb, retval);
@@ -3043,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = {
3043 .mknod = ext4_mknod, 3228 .mknod = ext4_mknod,
3044 .rename = ext4_rename, 3229 .rename = ext4_rename,
3045 .setattr = ext4_setattr, 3230 .setattr = ext4_setattr,
3046#ifdef CONFIG_EXT4_FS_XATTR
3047 .setxattr = generic_setxattr, 3231 .setxattr = generic_setxattr,
3048 .getxattr = generic_getxattr, 3232 .getxattr = generic_getxattr,
3049 .listxattr = ext4_listxattr, 3233 .listxattr = ext4_listxattr,
3050 .removexattr = generic_removexattr, 3234 .removexattr = generic_removexattr,
3051#endif
3052 .get_acl = ext4_get_acl, 3235 .get_acl = ext4_get_acl,
3053 .fiemap = ext4_fiemap, 3236 .fiemap = ext4_fiemap,
3054}; 3237};
3055 3238
3056const struct inode_operations ext4_special_inode_operations = { 3239const struct inode_operations ext4_special_inode_operations = {
3057 .setattr = ext4_setattr, 3240 .setattr = ext4_setattr,
3058#ifdef CONFIG_EXT4_FS_XATTR
3059 .setxattr = generic_setxattr, 3241 .setxattr = generic_setxattr,
3060 .getxattr = generic_getxattr, 3242 .getxattr = generic_getxattr,
3061 .listxattr = ext4_listxattr, 3243 .listxattr = ext4_listxattr,
3062 .removexattr = generic_removexattr, 3244 .removexattr = generic_removexattr,
3063#endif
3064 .get_acl = ext4_get_acl, 3245 .get_acl = ext4_get_acl,
3065}; 3246};
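
Taken together, the rename hunks swap the PARENT_INO() pointer arithmetic for a parent_de pointer handed back by ext4_get_first_dir_block(), which works whether ".." lives in block 0 or in the inode body. A condensed sketch of the resulting flow for the "moving a directory" case (error handling and the htree ext4_handle_dirty_dx_node() branch elided):

	struct ext4_dir_entry_2 *parent_de;
	int inlined = 0;

	dir_bh = ext4_get_first_dir_block(handle, old_inode, &retval,
					  &parent_de, &inlined);
	/* ... checks ... */
	parent_de->inode = cpu_to_le32(new_dir->i_ino);	/* repoint ".." */
	if (inlined)
		retval = ext4_mark_inode_dirty(handle, old_inode);
	else
		retval = ext4_handle_dirty_dirent_node(handle, old_inode,
						       dir_bh);
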
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68e896e12a67..0016fbca2a40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -27,7 +27,6 @@
27#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
28#include "xattr.h" 28#include "xattr.h"
29#include "acl.h" 29#include "acl.h"
30#include "ext4_extents.h"
31 30
32static struct kmem_cache *io_page_cachep, *io_end_cachep; 31static struct kmem_cache *io_page_cachep, *io_end_cachep;
33 32
@@ -111,7 +110,7 @@ static int ext4_end_io(ext4_io_end_t *io)
111 inode_dio_done(inode); 110 inode_dio_done(inode);
112 /* Wake up anyone waiting on unwritten extent conversion */ 111 /* Wake up anyone waiting on unwritten extent conversion */
113 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 112 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
114 wake_up_all(ext4_ioend_wq(io->inode)); 113 wake_up_all(ext4_ioend_wq(inode));
115 return ret; 114 return ret;
116} 115}
117 116
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 47bf06a2765d..d99387b89edd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
783 783
784 err = ext4_journal_get_write_access(handle, gdb_bh); 784 err = ext4_journal_get_write_access(handle, gdb_bh);
785 if (unlikely(err)) 785 if (unlikely(err))
786 goto exit_sbh; 786 goto exit_dind;
787 787
788 err = ext4_journal_get_write_access(handle, dind); 788 err = ext4_journal_get_write_access(handle, dind);
789 if (unlikely(err)) 789 if (unlikely(err))
@@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
792 /* ext4_reserve_inode_write() gets a reference on the iloc */ 792 /* ext4_reserve_inode_write() gets a reference on the iloc */
793 err = ext4_reserve_inode_write(handle, inode, &iloc); 793 err = ext4_reserve_inode_write(handle, inode, &iloc);
794 if (unlikely(err)) 794 if (unlikely(err))
795 goto exit_dindj; 795 goto exit_dind;
796 796
797 n_group_desc = ext4_kvmalloc((gdb_num + 1) * 797 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
798 sizeof(struct buffer_head *), 798 sizeof(struct buffer_head *),
@@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
846 846
847exit_inode: 847exit_inode:
848 ext4_kvfree(n_group_desc); 848 ext4_kvfree(n_group_desc);
849 /* ext4_handle_release_buffer(handle, iloc.bh); */
850 brelse(iloc.bh); 849 brelse(iloc.bh);
851exit_dindj:
852 /* ext4_handle_release_buffer(handle, dind); */
853exit_sbh:
854 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
855exit_dind: 850exit_dind:
856 brelse(dind); 851 brelse(dind);
857exit_bh: 852exit_bh:
@@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
969 } 964 }
970 965
971 for (i = 0; i < reserved_gdb; i++) { 966 for (i = 0; i < reserved_gdb; i++) {
972 if ((err = ext4_journal_get_write_access(handle, primary[i]))) { 967 if ((err = ext4_journal_get_write_access(handle, primary[i])))
973 /*
974 int j;
975 for (j = 0; j < i; j++)
976 ext4_handle_release_buffer(handle, primary[j]);
977 */
978 goto exit_bh; 968 goto exit_bh;
979 }
980 } 969 }
981 970
982 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 971 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80928f716850..3cdb0a2fc648 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,7 +45,7 @@
45#include <linux/freezer.h> 45#include <linux/freezer.h>
46 46
47#include "ext4.h" 47#include "ext4.h"
48#include "ext4_extents.h" 48#include "ext4_extents.h" /* Needed for trace points definition */
49#include "ext4_jbd2.h" 49#include "ext4_jbd2.h"
50#include "xattr.h" 50#include "xattr.h"
51#include "acl.h" 51#include "acl.h"
@@ -939,10 +939,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
939 return NULL; 939 return NULL;
940 940
941 ei->vfs_inode.i_version = 1; 941 ei->vfs_inode.i_version = 1;
942 ei->vfs_inode.i_data.writeback_index = 0;
943 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 942 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
944 INIT_LIST_HEAD(&ei->i_prealloc_list); 943 INIT_LIST_HEAD(&ei->i_prealloc_list);
945 spin_lock_init(&ei->i_prealloc_lock); 944 spin_lock_init(&ei->i_prealloc_lock);
945 ext4_es_init_tree(&ei->i_es_tree);
946 rwlock_init(&ei->i_es_lock);
946 ei->i_reserved_data_blocks = 0; 947 ei->i_reserved_data_blocks = 0;
947 ei->i_reserved_meta_blocks = 0; 948 ei->i_reserved_meta_blocks = 0;
948 ei->i_allocated_meta_blocks = 0; 949 ei->i_allocated_meta_blocks = 0;
@@ -996,9 +997,7 @@ static void init_once(void *foo)
996 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 997 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
997 998
998 INIT_LIST_HEAD(&ei->i_orphan); 999 INIT_LIST_HEAD(&ei->i_orphan);
999#ifdef CONFIG_EXT4_FS_XATTR
1000 init_rwsem(&ei->xattr_sem); 1000 init_rwsem(&ei->xattr_sem);
1001#endif
1002 init_rwsem(&ei->i_data_sem); 1001 init_rwsem(&ei->i_data_sem);
1003 inode_init_once(&ei->vfs_inode); 1002 inode_init_once(&ei->vfs_inode);
1004} 1003}
@@ -1031,6 +1030,7 @@ void ext4_clear_inode(struct inode *inode)
1031 clear_inode(inode); 1030 clear_inode(inode);
1032 dquot_drop(inode); 1031 dquot_drop(inode);
1033 ext4_discard_preallocations(inode); 1032 ext4_discard_preallocations(inode);
1033 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1034 if (EXT4_I(inode)->jinode) { 1034 if (EXT4_I(inode)->jinode) {
1035 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 1035 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1036 EXT4_I(inode)->jinode); 1036 EXT4_I(inode)->jinode);
@@ -1447,13 +1447,8 @@ static const struct mount_opts {
1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, 1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, 1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, 1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1450#ifdef CONFIG_EXT4_FS_XATTR
1451 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, 1450 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1452 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, 1451 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1453#else
1454 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1455 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1456#endif
1457#ifdef CONFIG_EXT4_FS_POSIX_ACL 1452#ifdef CONFIG_EXT4_FS_POSIX_ACL
1458 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, 1453 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1459 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, 1454 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
@@ -3202,7 +3197,6 @@ int ext4_calculate_overhead(struct super_block *sb)
3202 ext4_fsblk_t overhead = 0; 3197 ext4_fsblk_t overhead = 0;
3203 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3198 char *buf = (char *) get_zeroed_page(GFP_KERNEL);
3204 3199
3205 memset(buf, 0, PAGE_SIZE);
3206 if (!buf) 3200 if (!buf)
3207 return -ENOMEM; 3201 return -ENOMEM;
3208 3202
@@ -3256,7 +3250,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3256 unsigned int i; 3250 unsigned int i;
3257 int needs_recovery, has_huge_files, has_bigalloc; 3251 int needs_recovery, has_huge_files, has_bigalloc;
3258 __u64 blocks_count; 3252 __u64 blocks_count;
3259 int err; 3253 int err = 0;
3260 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3254 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3261 ext4_group_t first_not_zeroed; 3255 ext4_group_t first_not_zeroed;
3262 3256
@@ -3272,9 +3266,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3272 } 3266 }
3273 sb->s_fs_info = sbi; 3267 sb->s_fs_info = sbi;
3274 sbi->s_sb = sb; 3268 sbi->s_sb = sb;
3275 sbi->s_mount_opt = 0;
3276 sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
3277 sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
3278 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 3269 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3279 sbi->s_sb_block = sb_block; 3270 sbi->s_sb_block = sb_block;
3280 if (sb->s_bdev->bd_part) 3271 if (sb->s_bdev->bd_part)
@@ -3285,6 +3276,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3285 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3276 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
3286 *cp = '!'; 3277 *cp = '!';
3287 3278
3279 /* -EINVAL is default */
3288 ret = -EINVAL; 3280 ret = -EINVAL;
3289 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 3281 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3290 if (!blocksize) { 3282 if (!blocksize) {
@@ -3369,9 +3361,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 if (def_mount_opts & EXT4_DEFM_UID16) 3361 if (def_mount_opts & EXT4_DEFM_UID16)
3370 set_opt(sb, NO_UID32); 3362 set_opt(sb, NO_UID32);
3371 /* xattr user namespace & acls are now defaulted on */ 3363 /* xattr user namespace & acls are now defaulted on */
3372#ifdef CONFIG_EXT4_FS_XATTR
3373 set_opt(sb, XATTR_USER); 3364 set_opt(sb, XATTR_USER);
3374#endif
3375#ifdef CONFIG_EXT4_FS_POSIX_ACL 3365#ifdef CONFIG_EXT4_FS_POSIX_ACL
3376 set_opt(sb, POSIX_ACL); 3366 set_opt(sb, POSIX_ACL);
3377#endif 3367#endif
@@ -3662,7 +3652,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3662 " too large to mount safely on this system"); 3652 " too large to mount safely on this system");
3663 if (sizeof(sector_t) < 8) 3653 if (sizeof(sector_t) < 8)
3664 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3654 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3665 ret = err;
3666 goto failed_mount; 3655 goto failed_mount;
3667 } 3656 }
3668 3657
@@ -3770,7 +3759,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3770 } 3759 }
3771 if (err) { 3760 if (err) {
3772 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3761 ext4_msg(sb, KERN_ERR, "insufficient memory");
3773 ret = err;
3774 goto failed_mount3; 3762 goto failed_mount3;
3775 } 3763 }
3776 3764
@@ -3801,7 +3789,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3801 3789
3802 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3790 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3803 mutex_init(&sbi->s_orphan_lock); 3791 mutex_init(&sbi->s_orphan_lock);
3804 sbi->s_resize_flags = 0;
3805 3792
3806 sb->s_root = NULL; 3793 sb->s_root = NULL;
3807 3794
@@ -3897,8 +3884,8 @@ no_journal:
3897 if (es->s_overhead_clusters) 3884 if (es->s_overhead_clusters)
3898 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); 3885 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
3899 else { 3886 else {
3900 ret = ext4_calculate_overhead(sb); 3887 err = ext4_calculate_overhead(sb);
3901 if (ret) 3888 if (err)
3902 goto failed_mount_wq; 3889 goto failed_mount_wq;
3903 } 3890 }
3904 3891
@@ -3910,6 +3897,7 @@ no_journal:
3910 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3897 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3911 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3898 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3912 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3899 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3900 ret = -ENOMEM;
3913 goto failed_mount_wq; 3901 goto failed_mount_wq;
3914 } 3902 }
3915 3903
@@ -4012,12 +4000,20 @@ no_journal:
4012 /* Enable quota usage during mount. */ 4000 /* Enable quota usage during mount. */
4013 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && 4001 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
4014 !(sb->s_flags & MS_RDONLY)) { 4002 !(sb->s_flags & MS_RDONLY)) {
4015 ret = ext4_enable_quotas(sb); 4003 err = ext4_enable_quotas(sb);
4016 if (ret) 4004 if (err)
4017 goto failed_mount7; 4005 goto failed_mount7;
4018 } 4006 }
4019#endif /* CONFIG_QUOTA */ 4007#endif /* CONFIG_QUOTA */
4020 4008
4009 if (test_opt(sb, DISCARD)) {
4010 struct request_queue *q = bdev_get_queue(sb->s_bdev);
4011 if (!blk_queue_discard(q))
4012 ext4_msg(sb, KERN_WARNING,
4013 "mounting with \"discard\" option, but "
4014 "the device does not support discard");
4015 }
4016
4021 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 4017 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4022 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 4018 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
4023 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 4019 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
@@ -4084,7 +4080,7 @@ out_fail:
4084 kfree(sbi); 4080 kfree(sbi);
4085out_free_orig: 4081out_free_orig:
4086 kfree(orig_data); 4082 kfree(orig_data);
4087 return ret; 4083 return err ? err : ret;
4088} 4084}
4089 4085
4090/* 4086/*
@@ -4790,7 +4786,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4790 4786
4791 buf->f_type = EXT4_SUPER_MAGIC; 4787 buf->f_type = EXT4_SUPER_MAGIC;
4792 buf->f_bsize = sb->s_blocksize; 4788 buf->f_bsize = sb->s_blocksize;
4793 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); 4789 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
4794 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - 4790 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
4795 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4791 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4796 /* prevent underflow in case that few free space is available */ 4792 /* prevent underflow in case that few free space is available */
@@ -5282,6 +5278,7 @@ static int __init ext4_init_fs(void)
5282 ext4_li_info = NULL; 5278 ext4_li_info = NULL;
5283 mutex_init(&ext4_li_mtx); 5279 mutex_init(&ext4_li_mtx);
5284 5280
5281 /* Build-time check for flags consistency */
5285 ext4_check_flag_values(); 5282 ext4_check_flag_values();
5286 5283
5287 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 5284 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
@@ -5289,9 +5286,14 @@ static int __init ext4_init_fs(void)
5289 init_waitqueue_head(&ext4__ioend_wq[i]); 5286 init_waitqueue_head(&ext4__ioend_wq[i]);
5290 } 5287 }
5291 5288
5292 err = ext4_init_pageio(); 5289 err = ext4_init_es();
5293 if (err) 5290 if (err)
5294 return err; 5291 return err;
5292
5293 err = ext4_init_pageio();
5294 if (err)
5295 goto out7;
5296
5295 err = ext4_init_system_zone(); 5297 err = ext4_init_system_zone();
5296 if (err) 5298 if (err)
5297 goto out6; 5299 goto out6;
@@ -5341,6 +5343,9 @@ out5:
5341 ext4_exit_system_zone(); 5343 ext4_exit_system_zone();
5342out6: 5344out6:
5343 ext4_exit_pageio(); 5345 ext4_exit_pageio();
5346out7:
5347 ext4_exit_es();
5348
5344 return err; 5349 return err;
5345} 5350}
5346 5351
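
The init hunk threads the new extents-status (es) cache into the module's existing unwind chain: ext4_init_es() now runs first and gains an out7 teardown label. Reduced to its shape (a sketch of the ordering only, not the full function):

	err = ext4_init_es();
	if (err)
		return err;
	err = ext4_init_pageio();
	if (err)
		goto out7;		/* undo ext4_init_es() */
	err = ext4_init_system_zone();
	if (err)
		goto out6;
	/* ... further steps, unwinding in reverse order ... */
out6:
	ext4_exit_pageio();
out7:
	ext4_exit_es();
	return err;
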
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ed9354aff279..ff3711932018 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = {
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr, 37 .setattr = ext4_setattr,
38#ifdef CONFIG_EXT4_FS_XATTR
39 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
40 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
41 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
42 .removexattr = generic_removexattr, 41 .removexattr = generic_removexattr,
43#endif
44}; 42};
45 43
46const struct inode_operations ext4_fast_symlink_inode_operations = { 44const struct inode_operations ext4_fast_symlink_inode_operations = {
47 .readlink = generic_readlink, 45 .readlink = generic_readlink,
48 .follow_link = ext4_follow_link, 46 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr, 47 .setattr = ext4_setattr,
50#ifdef CONFIG_EXT4_FS_XATTR
51 .setxattr = generic_setxattr, 48 .setxattr = generic_setxattr,
52 .getxattr = generic_getxattr, 49 .getxattr = generic_getxattr,
53 .listxattr = ext4_listxattr, 50 .listxattr = ext4_listxattr,
54 .removexattr = generic_removexattr, 51 .removexattr = generic_removexattr,
55#endif
56}; 52};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2cdb98d62980..3a91ebc2b66f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,11 +61,6 @@
61#include "xattr.h" 61#include "xattr.h"
62#include "acl.h" 62#include "acl.h"
63 63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#ifdef EXT4_XATTR_DEBUG 64#ifdef EXT4_XATTR_DEBUG
70# define ea_idebug(inode, f...) do { \ 65# define ea_idebug(inode, f...) do { \
71 printk(KERN_DEBUG "inode %s:%lu: ", \ 66 printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -312,7 +307,7 @@ cleanup:
312 return error; 307 return error;
313} 308}
314 309
315static int 310int
316ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, 311ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
317 void *buffer, size_t buffer_size) 312 void *buffer, size_t buffer_size)
318{ 313{
@@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
581 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 576 return (*min_offs - ((void *)last - base) - sizeof(__u32));
582} 577}
583 578
584struct ext4_xattr_info {
585 int name_index;
586 const char *name;
587 const void *value;
588 size_t value_len;
589};
590
591struct ext4_xattr_search {
592 struct ext4_xattr_entry *first;
593 void *base;
594 void *end;
595 struct ext4_xattr_entry *here;
596 int not_found;
597};
598
599static int 579static int
600ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) 580ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
601{ 581{
@@ -648,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
648 size. Just replace. */ 628 size. Just replace. */
649 s->here->e_value_size = 629 s->here->e_value_size =
650 cpu_to_le32(i->value_len); 630 cpu_to_le32(i->value_len);
651 memset(val + size - EXT4_XATTR_PAD, 0, 631 if (i->value == EXT4_ZERO_XATTR_VALUE) {
652 EXT4_XATTR_PAD); /* Clear pad bytes. */ 632 memset(val, 0, size);
653 memcpy(val, i->value, i->value_len); 633 } else {
634 /* Clear pad bytes first. */
635 memset(val + size - EXT4_XATTR_PAD, 0,
636 EXT4_XATTR_PAD);
637 memcpy(val, i->value, i->value_len);
638 }
654 return 0; 639 return 0;
655 } 640 }
656 641
@@ -689,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
689 size_t size = EXT4_XATTR_SIZE(i->value_len); 674 size_t size = EXT4_XATTR_SIZE(i->value_len);
690 void *val = s->base + min_offs - size; 675 void *val = s->base + min_offs - size;
691 s->here->e_value_offs = cpu_to_le16(min_offs - size); 676 s->here->e_value_offs = cpu_to_le16(min_offs - size);
692 memset(val + size - EXT4_XATTR_PAD, 0, 677 if (i->value == EXT4_ZERO_XATTR_VALUE) {
693 EXT4_XATTR_PAD); /* Clear the pad bytes. */ 678 memset(val, 0, size);
694 memcpy(val, i->value, i->value_len); 679 } else {
680 /* Clear the pad bytes first. */
681 memset(val + size - EXT4_XATTR_PAD, 0,
682 EXT4_XATTR_PAD);
683 memcpy(val, i->value, i->value_len);
684 }
695 } 685 }
696 } 686 }
697 return 0; 687 return 0;
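
EXT4_ZERO_XATTR_VALUE (defined as (void *)-1 in xattr.h below) is a sentinel, not a buffer: a caller passes it to reserve value_len bytes of xattr space, and ext4_xattr_set_entry() zero-fills the slot instead of dereferencing the pointer. A hypothetical caller, for illustration only (the name and index here are placeholders, not taken from this patch):

	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = "data",			/* placeholder name */
		.value = EXT4_ZERO_XATTR_VALUE,	/* reserve, don't copy */
		.value_len = len,
	};
	/* ext4_xattr_set_entry(&i, s) will memset() the value area */
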
@@ -794,7 +784,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
794 int offset = (char *)s->here - bs->bh->b_data; 784 int offset = (char *)s->here - bs->bh->b_data;
795 785
796 unlock_buffer(bs->bh); 786 unlock_buffer(bs->bh);
797 ext4_handle_release_buffer(handle, bs->bh);
798 if (ce) { 787 if (ce) {
799 mb_cache_entry_release(ce); 788 mb_cache_entry_release(ce);
800 ce = NULL; 789 ce = NULL;
@@ -950,14 +939,8 @@ bad_block:
950#undef header 939#undef header
951} 940}
952 941
953struct ext4_xattr_ibody_find { 942int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
954 struct ext4_xattr_search s; 943 struct ext4_xattr_ibody_find *is)
955 struct ext4_iloc iloc;
956};
957
958static int
959ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
960 struct ext4_xattr_ibody_find *is)
961{ 944{
962 struct ext4_xattr_ibody_header *header; 945 struct ext4_xattr_ibody_header *header;
963 struct ext4_inode *raw_inode; 946 struct ext4_inode *raw_inode;
@@ -985,10 +968,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
985 return 0; 968 return 0;
986} 969}
987 970
988static int 971int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
989ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 972 struct ext4_xattr_info *i,
990 struct ext4_xattr_info *i, 973 struct ext4_xattr_ibody_find *is)
991 struct ext4_xattr_ibody_find *is) 974{
975 struct ext4_xattr_ibody_header *header;
976 struct ext4_xattr_search *s = &is->s;
977 int error;
978
979 if (EXT4_I(inode)->i_extra_isize == 0)
980 return -ENOSPC;
981 error = ext4_xattr_set_entry(i, s);
982 if (error) {
983 if (error == -ENOSPC &&
984 ext4_has_inline_data(inode)) {
985 error = ext4_try_to_evict_inline_data(handle, inode,
986 EXT4_XATTR_LEN(strlen(i->name) +
987 EXT4_XATTR_SIZE(i->value_len)));
988 if (error)
989 return error;
990 error = ext4_xattr_ibody_find(inode, i, is);
991 if (error)
992 return error;
993 error = ext4_xattr_set_entry(i, s);
994 }
995 if (error)
996 return error;
997 }
998 header = IHDR(inode, ext4_raw_inode(&is->iloc));
999 if (!IS_LAST_ENTRY(s->first)) {
1000 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
1001 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
1002 } else {
1003 header->h_magic = cpu_to_le32(0);
1004 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
1005 }
1006 return 0;
1007}
1008
1009static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
1010 struct ext4_xattr_info *i,
1011 struct ext4_xattr_ibody_find *is)
992{ 1012{
993 struct ext4_xattr_ibody_header *header; 1013 struct ext4_xattr_ibody_header *header;
994 struct ext4_xattr_search *s = &is->s; 1014 struct ext4_xattr_search *s = &is->s;
@@ -1144,9 +1164,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1144{ 1164{
1145 handle_t *handle; 1165 handle_t *handle;
1146 int error, retries = 0; 1166 int error, retries = 0;
1167 int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
1147 1168
1148retry: 1169retry:
1149 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 1170 /*
 1171 * In case of inline data, we may push the data out to a block,
 1172 * so reserve the journal space first.
1173 */
1174 if (ext4_has_inline_data(inode))
1175 credits += ext4_writepage_trans_blocks(inode) + 1;
1176
1177 handle = ext4_journal_start(inode, credits);
1150 if (IS_ERR(handle)) { 1178 if (IS_ERR(handle)) {
1151 error = PTR_ERR(handle); 1179 error = PTR_ERR(handle);
1152 } else { 1180 } else {
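
Because setting an xattr can now evict inline data into a real block, ext4_xattr_set() budgets the journal credits for that up front rather than failing mid-transaction. The arithmetic, as the hunk reads:

	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);

	/* a possible inline-data eviction costs a full writepage
	 * transaction plus one extra block */
	if (ext4_has_inline_data(inode))
		credits += ext4_writepage_trans_blocks(inode) + 1;

	handle = ext4_journal_start(inode, credits);
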
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 91f31ca7d9af..69eda787a96a 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@
21#define EXT4_XATTR_INDEX_TRUSTED 4 21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5 22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7
24 25
25struct ext4_xattr_header { 26struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */ 27 __le32 h_magic; /* magic number for identification */
@@ -65,7 +66,32 @@ struct ext4_xattr_entry {
65 EXT4_I(inode)->i_extra_isize)) 66 EXT4_I(inode)->i_extra_isize))
66#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 67#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
67 68
68# ifdef CONFIG_EXT4_FS_XATTR 69#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
70#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
71#define BFIRST(bh) ENTRY(BHDR(bh)+1)
72#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
73
74#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
75
76struct ext4_xattr_info {
77 int name_index;
78 const char *name;
79 const void *value;
80 size_t value_len;
81};
82
83struct ext4_xattr_search {
84 struct ext4_xattr_entry *first;
85 void *base;
86 void *end;
87 struct ext4_xattr_entry *here;
88 int not_found;
89};
90
91struct ext4_xattr_ibody_find {
92 struct ext4_xattr_search s;
93 struct ext4_iloc iloc;
94};
69 95
70extern const struct xattr_handler ext4_xattr_user_handler; 96extern const struct xattr_handler ext4_xattr_user_handler;
71extern const struct xattr_handler ext4_xattr_trusted_handler; 97extern const struct xattr_handler ext4_xattr_trusted_handler;
@@ -90,60 +116,82 @@ extern void ext4_exit_xattr(void);
90 116
91extern const struct xattr_handler *ext4_xattr_handlers[]; 117extern const struct xattr_handler *ext4_xattr_handlers[];
92 118
93# else /* CONFIG_EXT4_FS_XATTR */ 119extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
94 120 struct ext4_xattr_ibody_find *is);
95static inline int 121extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
96ext4_xattr_get(struct inode *inode, int name_index, const char *name, 122 const char *name,
97 void *buffer, size_t size, int flags) 123 void *buffer, size_t buffer_size);
98{ 124extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
99 return -EOPNOTSUPP; 125 struct ext4_xattr_info *i,
100} 126 struct ext4_xattr_ibody_find *is);
101 127
102static inline int 128extern int ext4_has_inline_data(struct inode *inode);
103ext4_xattr_set(struct inode *inode, int name_index, const char *name, 129extern int ext4_get_inline_size(struct inode *inode);
104 const void *value, size_t size, int flags) 130extern int ext4_get_max_inline_size(struct inode *inode);
105{ 131extern int ext4_find_inline_data_nolock(struct inode *inode);
106 return -EOPNOTSUPP; 132extern void ext4_write_inline_data(struct inode *inode,
107} 133 struct ext4_iloc *iloc,
108 134 void *buffer, loff_t pos,
109static inline int 135 unsigned int len);
110ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, 136extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
111 const char *name, const void *value, size_t size, int flags) 137 unsigned int len);
112{ 138extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
113 return -EOPNOTSUPP; 139 unsigned int len);
114} 140extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
115 141
116static inline void 142extern int ext4_readpage_inline(struct inode *inode, struct page *page);
117ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) 143extern int ext4_try_to_write_inline_data(struct address_space *mapping,
118{ 144 struct inode *inode,
119} 145 loff_t pos, unsigned len,
120 146 unsigned flags,
121static inline void 147 struct page **pagep);
122ext4_xattr_put_super(struct super_block *sb) 148extern int ext4_write_inline_data_end(struct inode *inode,
123{ 149 loff_t pos, unsigned len,
124} 150 unsigned copied,
125 151 struct page *page);
126static __init inline int 152extern struct buffer_head *
127ext4_init_xattr(void) 153ext4_journalled_write_inline_data(struct inode *inode,
128{ 154 unsigned len,
129 return 0; 155 struct page *page);
130} 156extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
131 157 struct inode *inode,
132static inline void 158 loff_t pos, unsigned len,
133ext4_exit_xattr(void) 159 unsigned flags,
134{ 160 struct page **pagep,
135} 161 void **fsdata);
136 162extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
137static inline int 163 unsigned len, unsigned copied,
138ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 164 struct page *page);
139 struct ext4_inode *raw_inode, handle_t *handle) 165extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
140{ 166 struct inode *inode);
141 return -EOPNOTSUPP; 167extern int ext4_try_create_inline_dir(handle_t *handle,
142} 168 struct inode *parent,
143 169 struct inode *inode);
144#define ext4_xattr_handlers NULL 170extern int ext4_read_inline_dir(struct file *filp,
145 171 void *dirent, filldir_t filldir,
146# endif /* CONFIG_EXT4_FS_XATTR */ 172 int *has_inline_data);
173extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
174 const struct qstr *d_name,
175 struct ext4_dir_entry_2 **res_dir,
176 int *has_inline_data);
177extern int ext4_delete_inline_entry(handle_t *handle,
178 struct inode *dir,
179 struct ext4_dir_entry_2 *de_del,
180 struct buffer_head *bh,
181 int *has_inline_data);
182extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
183extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
184 struct ext4_dir_entry_2 **parent_de,
185 int *retval);
186extern int ext4_inline_data_fiemap(struct inode *inode,
187 struct fiemap_extent_info *fieinfo,
188 int *has_inline);
189extern int ext4_try_to_evict_inline_data(handle_t *handle,
190 struct inode *inode,
191 int needed);
192extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
193
194extern int ext4_convert_inline_data(struct inode *inode);
147 195
148#ifdef CONFIG_EXT4_FS_SECURITY 196#ifdef CONFIG_EXT4_FS_SECURITY
149extern int ext4_init_security(handle_t *handle, struct inode *inode, 197extern int ext4_init_security(handle_t *handle, struct inode *inode,
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 623f36f0423b..12701a567752 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -29,6 +29,7 @@ struct fat_mount_options {
29 unsigned short fs_fmask; 29 unsigned short fs_fmask;
30 unsigned short fs_dmask; 30 unsigned short fs_dmask;
31 unsigned short codepage; /* Codepage for shortname conversions */ 31 unsigned short codepage; /* Codepage for shortname conversions */
32 int time_offset; /* Offset of timestamps from UTC (in minutes) */
32 char *iocharset; /* Charset used for filename input/display */ 33 char *iocharset; /* Charset used for filename input/display */
33 unsigned short shortname; /* flags for shortname display/create rule */ 34 unsigned short shortname; /* flags for shortname display/create rule */
34 unsigned char name_check; /* r = relaxed, n = normal, s = strict */ 35 unsigned char name_check; /* r = relaxed, n = normal, s = strict */
@@ -45,7 +46,7 @@ struct fat_mount_options {
45 flush:1, /* write things quickly */ 46 flush:1, /* write things quickly */
46 nocase:1, /* Does this need case conversion? 0=need case conversion*/ 47 nocase:1, /* Does this need case conversion? 0=need case conversion*/
47 usefree:1, /* Use free_clusters for FAT32 */ 48 usefree:1, /* Use free_clusters for FAT32 */
48 tz_utc:1, /* Filesystem timestamps are in UTC */ 49 tz_set:1, /* Filesystem timestamps' offset set */
49 rodir:1, /* allow ATTR_RO for directory */ 50 rodir:1, /* allow ATTR_RO for directory */
50 discard:1, /* Issue discard requests on deletions */ 51 discard:1, /* Issue discard requests on deletions */
51 nfs:1; /* Do extra work needed for NFS export */ 52 nfs:1; /* Do extra work needed for NFS export */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5bafaad00530..35806813ea4e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -26,6 +26,7 @@
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/blkdev.h>
29#include <asm/unaligned.h> 30#include <asm/unaligned.h>
30#include "fat.h" 31#include "fat.h"
31 32
@@ -725,7 +726,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
725 if (opts->allow_utime) 726 if (opts->allow_utime)
726 seq_printf(m, ",allow_utime=%04o", opts->allow_utime); 727 seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
727 if (sbi->nls_disk) 728 if (sbi->nls_disk)
728 seq_printf(m, ",codepage=%s", sbi->nls_disk->charset); 729 /* strip "cp" prefix from displayed option */
730 seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]);
729 if (isvfat) { 731 if (isvfat) {
730 if (sbi->nls_io) 732 if (sbi->nls_io)
731 seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); 733 seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
@@ -777,8 +779,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
777 } 779 }
778 if (opts->flush) 780 if (opts->flush)
779 seq_puts(m, ",flush"); 781 seq_puts(m, ",flush");
780 if (opts->tz_utc) 782 if (opts->tz_set) {
781 seq_puts(m, ",tz=UTC"); 783 if (opts->time_offset)
784 seq_printf(m, ",time_offset=%d", opts->time_offset);
785 else
786 seq_puts(m, ",tz=UTC");
787 }
782 if (opts->errors == FAT_ERRORS_CONT) 788 if (opts->errors == FAT_ERRORS_CONT)
783 seq_puts(m, ",errors=continue"); 789 seq_puts(m, ",errors=continue");
784 else if (opts->errors == FAT_ERRORS_PANIC) 790 else if (opts->errors == FAT_ERRORS_PANIC)
@@ -800,7 +806,8 @@ enum {
800 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 806 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
801 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 807 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
802 Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, 808 Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
803 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err, 809 Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
810 Opt_err,
804}; 811};
805 812
806static const match_table_t fat_tokens = { 813static const match_table_t fat_tokens = {
@@ -825,6 +832,7 @@ static const match_table_t fat_tokens = {
825 {Opt_immutable, "sys_immutable"}, 832 {Opt_immutable, "sys_immutable"},
826 {Opt_flush, "flush"}, 833 {Opt_flush, "flush"},
827 {Opt_tz_utc, "tz=UTC"}, 834 {Opt_tz_utc, "tz=UTC"},
835 {Opt_time_offset, "time_offset=%d"},
828 {Opt_err_cont, "errors=continue"}, 836 {Opt_err_cont, "errors=continue"},
829 {Opt_err_panic, "errors=panic"}, 837 {Opt_err_panic, "errors=panic"},
830 {Opt_err_ro, "errors=remount-ro"}, 838 {Opt_err_ro, "errors=remount-ro"},
@@ -909,7 +917,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
909 opts->utf8 = opts->unicode_xlate = 0; 917 opts->utf8 = opts->unicode_xlate = 0;
910 opts->numtail = 1; 918 opts->numtail = 1;
911 opts->usefree = opts->nocase = 0; 919 opts->usefree = opts->nocase = 0;
912 opts->tz_utc = 0; 920 opts->tz_set = 0;
913 opts->nfs = 0; 921 opts->nfs = 0;
914 opts->errors = FAT_ERRORS_RO; 922 opts->errors = FAT_ERRORS_RO;
915 *debug = 0; 923 *debug = 0;
@@ -965,48 +973,57 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
965 break; 973 break;
966 case Opt_uid: 974 case Opt_uid:
967 if (match_int(&args[0], &option)) 975 if (match_int(&args[0], &option))
968 return 0; 976 return -EINVAL;
969 opts->fs_uid = make_kuid(current_user_ns(), option); 977 opts->fs_uid = make_kuid(current_user_ns(), option);
970 if (!uid_valid(opts->fs_uid)) 978 if (!uid_valid(opts->fs_uid))
971 return 0; 979 return -EINVAL;
972 break; 980 break;
973 case Opt_gid: 981 case Opt_gid:
974 if (match_int(&args[0], &option)) 982 if (match_int(&args[0], &option))
975 return 0; 983 return -EINVAL;
976 opts->fs_gid = make_kgid(current_user_ns(), option); 984 opts->fs_gid = make_kgid(current_user_ns(), option);
977 if (!gid_valid(opts->fs_gid)) 985 if (!gid_valid(opts->fs_gid))
978 return 0; 986 return -EINVAL;
979 break; 987 break;
980 case Opt_umask: 988 case Opt_umask:
981 if (match_octal(&args[0], &option)) 989 if (match_octal(&args[0], &option))
982 return 0; 990 return -EINVAL;
983 opts->fs_fmask = opts->fs_dmask = option; 991 opts->fs_fmask = opts->fs_dmask = option;
984 break; 992 break;
985 case Opt_dmask: 993 case Opt_dmask:
986 if (match_octal(&args[0], &option)) 994 if (match_octal(&args[0], &option))
987 return 0; 995 return -EINVAL;
988 opts->fs_dmask = option; 996 opts->fs_dmask = option;
989 break; 997 break;
990 case Opt_fmask: 998 case Opt_fmask:
991 if (match_octal(&args[0], &option)) 999 if (match_octal(&args[0], &option))
992 return 0; 1000 return -EINVAL;
993 opts->fs_fmask = option; 1001 opts->fs_fmask = option;
994 break; 1002 break;
995 case Opt_allow_utime: 1003 case Opt_allow_utime:
996 if (match_octal(&args[0], &option)) 1004 if (match_octal(&args[0], &option))
997 return 0; 1005 return -EINVAL;
998 opts->allow_utime = option & (S_IWGRP | S_IWOTH); 1006 opts->allow_utime = option & (S_IWGRP | S_IWOTH);
999 break; 1007 break;
1000 case Opt_codepage: 1008 case Opt_codepage:
1001 if (match_int(&args[0], &option)) 1009 if (match_int(&args[0], &option))
1002 return 0; 1010 return -EINVAL;
1003 opts->codepage = option; 1011 opts->codepage = option;
1004 break; 1012 break;
1005 case Opt_flush: 1013 case Opt_flush:
1006 opts->flush = 1; 1014 opts->flush = 1;
1007 break; 1015 break;
1016 case Opt_time_offset:
1017 if (match_int(&args[0], &option))
1018 return -EINVAL;
1019 if (option < -12 * 60 || option > 12 * 60)
1020 return -EINVAL;
1021 opts->tz_set = 1;
1022 opts->time_offset = option;
1023 break;
1008 case Opt_tz_utc: 1024 case Opt_tz_utc:
1009 opts->tz_utc = 1; 1025 opts->tz_set = 1;
1026 opts->time_offset = 0;
1010 break; 1027 break;
1011 case Opt_err_cont: 1028 case Opt_err_cont:
1012 opts->errors = FAT_ERRORS_CONT; 1029 opts->errors = FAT_ERRORS_CONT;
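
Two things to note in the hunk above: parse_options() now reports a malformed or out-of-range value as -EINVAL rather than the bare 0 it returned before, and the new time_offset option is validated against a window of +/- 12 hours expressed in minutes. A minimal userspace sketch of the same range check; check_time_offset() is a hypothetical stand-in for the kernel's match_int() plus validation:

    /* Sketch: mirrors the time_offset range check above. */
    #include <stdio.h>
    #include <stdlib.h>

    static int check_time_offset(int minutes)
    {
        /* The patch accepts offsets within +/- 12 hours, in minutes. */
        if (minutes < -12 * 60 || minutes > 12 * 60)
            return -1; /* the kernel returns -EINVAL here */
        return 0;
    }

    int main(int argc, char **argv)
    {
        int off = argc > 1 ? atoi(argv[1]) : 0;
        printf("time_offset=%d -> %s\n", off,
               check_time_offset(off) ? "rejected" : "accepted");
        return 0;
    }
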
@@ -1431,6 +1448,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1431 goto out_fail; 1448 goto out_fail;
1432 } 1449 }
1433 1450
1451 if (sbi->options.discard) {
1452 struct request_queue *q = bdev_get_queue(sb->s_bdev);
1453 if (!blk_queue_discard(q))
1454 fat_msg(sb, KERN_WARNING,
1455 "mounting with \"discard\" option, but "
1456 "the device does not support discard");
1457 }
1458
1434 return 0; 1459 return 0;
1435 1460
1436out_invalid: 1461out_invalid:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 6d93360ca0cc..5eb600dc43a9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -212,8 +212,10 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
212 + days_in_year[month] + day 212 + days_in_year[month] + day
213 + DAYS_DELTA) * SECS_PER_DAY; 213 + DAYS_DELTA) * SECS_PER_DAY;
214 214
215 if (!sbi->options.tz_utc) 215 if (!sbi->options.tz_set)
216 second += sys_tz.tz_minuteswest * SECS_PER_MIN; 216 second += sys_tz.tz_minuteswest * SECS_PER_MIN;
217 else
218 second -= sbi->options.time_offset * SECS_PER_MIN;
217 219
218 if (time_cs) { 220 if (time_cs) {
219 ts->tv_sec = second + (time_cs / 100); 221 ts->tv_sec = second + (time_cs / 100);
@@ -229,8 +231,9 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
229 __le16 *time, __le16 *date, u8 *time_cs) 231 __le16 *time, __le16 *date, u8 *time_cs)
230{ 232{
231 struct tm tm; 233 struct tm tm;
232 time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 : 234 time_to_tm(ts->tv_sec,
233 -sys_tz.tz_minuteswest * 60, &tm); 235 (sbi->options.tz_set ? sbi->options.time_offset :
236 -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
234 237
235 /* FAT can only support year between 1980 to 2107 */ 238 /* FAT can only support year between 1980 to 2107 */
236 if (tm.tm_year < 1980 - 1900) { 239 if (tm.tm_year < 1980 - 1900) {
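
The conversion above gives time_offset its meaning: when tz_set is true, on-disk FAT timestamps are treated as local time at UTC plus time_offset minutes, so fat_time_fat2unix() subtracts time_offset * SECS_PER_MIN to reach UTC, and fat_time_unix2fat() passes the same offset (in seconds east of UTC) to time_to_tm(). The old tz=UTC option survives as the time_offset=0 special case. A runnable sketch of the forward direction, assuming time_offset counts minutes east of UTC:

    /* Sketch, not kernel code: local FAT time -> Unix time. */
    #include <stdio.h>

    #define SECS_PER_MIN 60

    static long long fat_local_to_unix(long long fat_seconds, int time_offset)
    {
        /* Mirrors fat_time_fat2unix() with tz_set: local -> UTC. */
        return fat_seconds - (long long)time_offset * SECS_PER_MIN;
    }

    int main(void)
    {
        /* A timestamp recorded on a volume used at UTC+2 (time_offset=120): */
        printf("unix time: %lld\n", fat_local_to_unix(1000000000LL, 120));
        return 0;
    }

So a mount such as "mount -o time_offset=120 ..." would interpret on-disk timestamps as UTC+2 regardless of sys_tz.
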
diff --git a/fs/fhandle.c b/fs/fhandle.c
index f775bfdd6e4a..cccdc874bb55 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -22,7 +22,7 @@ static long do_sys_name_to_handle(struct path *path,
22 struct file_handle *handle = NULL; 22 struct file_handle *handle = NULL;
23 23
24 /* 24 /*
25 * We need t make sure wether the file system 25 * We need to make sure whether the file system
26 * supports decoding of the file handle 26 * supports decoding of the file handle
27 */ 27 */
28 if (!path->dentry->d_sb->s_export_op || 28 if (!path->dentry->d_sb->s_export_op ||
@@ -40,7 +40,7 @@ static long do_sys_name_to_handle(struct path *path,
40 if (!handle) 40 if (!handle)
41 return -ENOMEM; 41 return -ENOMEM;
42 42
43 /* convert handle size to multiple of sizeof(u32) */ 43 /* convert handle size to multiple of sizeof(u32) */
44 handle_dwords = f_handle.handle_bytes >> 2; 44 handle_dwords = f_handle.handle_bytes >> 2;
45 45
46 /* we ask for a non connected handle */ 46 /* we ask for a non connected handle */
diff --git a/fs/file.c b/fs/file.c
index 708d997a7748..15cb8618e95d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -519,12 +519,6 @@ struct files_struct init_files = {
519 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 519 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
520}; 520};
521 521
522void daemonize_descriptors(void)
523{
524 atomic_inc(&init_files.count);
525 reset_files_struct(&init_files);
526}
527
528/* 522/*
529 * allocate a file descriptor, mark it busy. 523 * allocate a file descriptor, mark it busy.
530 */ 524 */
@@ -685,7 +679,6 @@ void do_close_on_exec(struct files_struct *files)
685 struct fdtable *fdt; 679 struct fdtable *fdt;
686 680
687 /* exec unshares first */ 681 /* exec unshares first */
688 BUG_ON(atomic_read(&files->count) != 1);
689 spin_lock(&files->file_lock); 682 spin_lock(&files->file_lock);
690 for (i = 0; ; i++) { 683 for (i = 0; ; i++) {
691 unsigned long set; 684 unsigned long set;
@@ -995,16 +988,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
995 const void *p) 988 const void *p)
996{ 989{
997 struct fdtable *fdt; 990 struct fdtable *fdt;
998 struct file *file;
999 int res = 0; 991 int res = 0;
1000 if (!files) 992 if (!files)
1001 return 0; 993 return 0;
1002 spin_lock(&files->file_lock); 994 spin_lock(&files->file_lock);
1003 fdt = files_fdtable(files); 995 for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1004 while (!res && n < fdt->max_fds) { 996 struct file *file;
1005 file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); 997 file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1006 if (file) 998 if (!file)
1007 res = f(p, file, n); 999 continue;
1000 res = f(p, file, n);
1001 if (res)
1002 break;
1008 } 1003 }
1009 spin_unlock(&files->file_lock); 1004 spin_unlock(&files->file_lock);
1010 return res; 1005 return res;
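
Besides dropping the function-wide file pointer, the rewritten loop fixes a subtle interface wart: the callback now receives the descriptor number of the file it is handed (previously n had already been incremented), and the walk stops on the first nonzero return instead of re-testing res at the top of the loop. A hedged sketch of a caller; count_fd() and count_open_files() are hypothetical:

    /* Sketch of an iterate_fd() user. */
    #include <linux/fdtable.h>

    static int count_fd(const void *p, struct file *file, unsigned fd)
    {
        unsigned *count = (unsigned *)p; /* owned by the caller */
        (*count)++;
        return 0; /* a nonzero return would stop the walk */
    }

    static unsigned count_open_files(struct files_struct *files)
    {
        unsigned count = 0;
        iterate_fd(files, 0, count_fd, &count);
        return count;
    }
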
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 51ea267d444c..310972b72a66 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
228static void inode_sync_complete(struct inode *inode) 228static void inode_sync_complete(struct inode *inode)
229{ 229{
230 inode->i_state &= ~I_SYNC; 230 inode->i_state &= ~I_SYNC;
231 /* If inode is clean and unused, put it into LRU now... */
232 inode_add_lru(inode);
231 /* Waiters must see I_SYNC cleared before being woken up */ 233 /* Waiters must see I_SYNC cleared before being woken up */
232 smp_mb(); 234 smp_mb();
233 wake_up_bit(&inode->i_state, __I_SYNC); 235 wake_up_bit(&inode->i_state, __I_SYNC);
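
The new inode_add_lru() call returns a clean, unused inode to the LRU as soon as writeback finishes, instead of leaving it off-list until the next iput(). Note how it slots in before the existing smp_mb(), which still orders the ~I_SYNC store ahead of the wake-up so a waiter cannot wake and observe the bit still set. The pairing, in sketch form (inode_wait as the bit-wait action callback is an assumption here):

    /* Waker side (as above): */
    inode->i_state &= ~I_SYNC;
    smp_mb();                               /* clear visible before wake */
    wake_up_bit(&inode->i_state, __I_SYNC);

    /* Waiter side (elsewhere in fs-writeback.c): */
    wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, TASK_UNINTERRUPTIBLE);
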
@@ -1032,7 +1034,7 @@ int bdi_writeback_thread(void *data)
1032 while (!kthread_freezable_should_stop(NULL)) { 1034 while (!kthread_freezable_should_stop(NULL)) {
1033 /* 1035 /*
1034 * Remove own delayed wake-up timer, since we are already awake 1036 * Remove own delayed wake-up timer, since we are already awake
1035 * and we'll take care of the preriodic write-back. 1037 * and we'll take care of the periodic write-back.
1036 */ 1038 */
1037 del_timer(&wb->wakeup_timer); 1039 del_timer(&wb->wakeup_timer);
1038 1040
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 5df4775fea03..fe6ca583bbc0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -164,27 +164,3 @@ struct fs_struct init_fs = {
164 .seq = SEQCNT_ZERO, 164 .seq = SEQCNT_ZERO,
165 .umask = 0022, 165 .umask = 0022,
166}; 166};
167
168void daemonize_fs_struct(void)
169{
170 struct fs_struct *fs = current->fs;
171
172 if (fs) {
173 int kill;
174
175 task_lock(current);
176
177 spin_lock(&init_fs.lock);
178 init_fs.users++;
179 spin_unlock(&init_fs.lock);
180
181 spin_lock(&fs->lock);
182 current->fs = &init_fs;
183 kill = !--fs->users;
184 spin_unlock(&fs->lock);
185
186 task_unlock(current);
187 if (kill)
188 free_fs_struct(fs);
189 }
190}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8c23fa7a91e6..c16335315e5d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req)
92 92
93static void fuse_req_init_context(struct fuse_req *req) 93static void fuse_req_init_context(struct fuse_req *req)
94{ 94{
95 req->in.h.uid = current_fsuid(); 95 req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
96 req->in.h.gid = current_fsgid(); 96 req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
97 req->in.h.pid = current->pid; 97 req->in.h.pid = current->pid;
98} 98}
99 99
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 324bc0850534..b7c09f9eb40c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
818 stat->ino = attr->ino; 818 stat->ino = attr->ino;
819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
820 stat->nlink = attr->nlink; 820 stat->nlink = attr->nlink;
821 stat->uid = attr->uid; 821 stat->uid = make_kuid(&init_user_ns, attr->uid);
822 stat->gid = attr->gid; 822 stat->gid = make_kgid(&init_user_ns, attr->gid);
823 stat->rdev = inode->i_rdev; 823 stat->rdev = inode->i_rdev;
824 stat->atime.tv_sec = attr->atime; 824 stat->atime.tv_sec = attr->atime;
825 stat->atime.tv_nsec = attr->atimensec; 825 stat->atime.tv_nsec = attr->atimensec;
@@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
1007 rcu_read_lock(); 1007 rcu_read_lock();
1008 ret = 0; 1008 ret = 0;
1009 cred = __task_cred(task); 1009 cred = __task_cred(task);
1010 if (cred->euid == fc->user_id && 1010 if (uid_eq(cred->euid, fc->user_id) &&
1011 cred->suid == fc->user_id && 1011 uid_eq(cred->suid, fc->user_id) &&
1012 cred->uid == fc->user_id && 1012 uid_eq(cred->uid, fc->user_id) &&
1013 cred->egid == fc->group_id && 1013 gid_eq(cred->egid, fc->group_id) &&
1014 cred->sgid == fc->group_id && 1014 gid_eq(cred->sgid, fc->group_id) &&
1015 cred->gid == fc->group_id) 1015 gid_eq(cred->gid, fc->group_id))
1016 ret = 1; 1016 ret = 1;
1017 rcu_read_unlock(); 1017 rcu_read_unlock();
1018 1018
@@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1306 if (ivalid & ATTR_MODE) 1306 if (ivalid & ATTR_MODE)
1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; 1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
1308 if (ivalid & ATTR_UID) 1308 if (ivalid & ATTR_UID)
1309 arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; 1309 arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
1310 if (ivalid & ATTR_GID) 1310 if (ivalid & ATTR_GID)
1311 arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; 1311 arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
1312 if (ivalid & ATTR_SIZE) 1312 if (ivalid & ATTR_SIZE)
1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; 1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
1314 if (ivalid & ATTR_ATIME) { 1314 if (ivalid & ATTR_ATIME) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 78d2837bc940..e21d4d8f87e3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1599,19 +1599,19 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1599 return err ? 0 : outarg.block; 1599 return err ? 0 : outarg.block;
1600} 1600}
1601 1601
1602static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) 1602static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
1603{ 1603{
1604 loff_t retval; 1604 loff_t retval;
1605 struct inode *inode = file->f_path.dentry->d_inode; 1605 struct inode *inode = file->f_path.dentry->d_inode;
1606 1606
1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 1607 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1608 if (origin == SEEK_CUR || origin == SEEK_SET) 1608 if (whence == SEEK_CUR || whence == SEEK_SET)
1609 return generic_file_llseek(file, offset, origin); 1609 return generic_file_llseek(file, offset, whence);
1610 1610
1611 mutex_lock(&inode->i_mutex); 1611 mutex_lock(&inode->i_mutex);
1612 retval = fuse_update_attributes(inode, NULL, file, NULL); 1612 retval = fuse_update_attributes(inode, NULL, file, NULL);
1613 if (!retval) 1613 if (!retval)
1614 retval = generic_file_llseek(file, offset, origin); 1614 retval = generic_file_llseek(file, offset, whence);
1615 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1616 1616
1617 return retval; 1617 return retval;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e24dd74e3068..e105a53fc72d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -333,10 +333,10 @@ struct fuse_conn {
333 atomic_t count; 333 atomic_t count;
334 334
335 /** The user id for this mount */ 335 /** The user id for this mount */
336 uid_t user_id; 336 kuid_t user_id;
337 337
338 /** The group id for this mount */ 338 /** The group id for this mount */
339 gid_t group_id; 339 kgid_t group_id;
340 340
341 /** The fuse mount flags for this mount */ 341 /** The fuse mount flags for this mount */
342 unsigned flags; 342 unsigned flags;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f0eda124cffb..73ca6b72beaf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh,
60struct fuse_mount_data { 60struct fuse_mount_data {
61 int fd; 61 int fd;
62 unsigned rootmode; 62 unsigned rootmode;
63 unsigned user_id; 63 kuid_t user_id;
64 unsigned group_id; 64 kgid_t group_id;
65 unsigned fd_present:1; 65 unsigned fd_present:1;
66 unsigned rootmode_present:1; 66 unsigned rootmode_present:1;
67 unsigned user_id_present:1; 67 unsigned user_id_present:1;
@@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
164 inode->i_ino = fuse_squash_ino(attr->ino); 164 inode->i_ino = fuse_squash_ino(attr->ino);
165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
166 set_nlink(inode, attr->nlink); 166 set_nlink(inode, attr->nlink);
167 inode->i_uid = attr->uid; 167 inode->i_uid = make_kuid(&init_user_ns, attr->uid);
168 inode->i_gid = attr->gid; 168 inode->i_gid = make_kgid(&init_user_ns, attr->gid);
169 inode->i_blocks = attr->blocks; 169 inode->i_blocks = attr->blocks;
170 inode->i_atime.tv_sec = attr->atime; 170 inode->i_atime.tv_sec = attr->atime;
171 inode->i_atime.tv_nsec = attr->atimensec; 171 inode->i_atime.tv_nsec = attr->atimensec;
@@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
492 case OPT_USER_ID: 492 case OPT_USER_ID:
493 if (match_int(&args[0], &value)) 493 if (match_int(&args[0], &value))
494 return 0; 494 return 0;
495 d->user_id = value; 495 d->user_id = make_kuid(current_user_ns(), value);
496 if (!uid_valid(d->user_id))
497 return 0;
496 d->user_id_present = 1; 498 d->user_id_present = 1;
497 break; 499 break;
498 500
499 case OPT_GROUP_ID: 501 case OPT_GROUP_ID:
500 if (match_int(&args[0], &value)) 502 if (match_int(&args[0], &value))
501 return 0; 503 return 0;
502 d->group_id = value; 504 d->group_id = make_kgid(current_user_ns(), value);
505 if (!gid_valid(d->group_id))
506 return 0;
503 d->group_id_present = 1; 507 d->group_id_present = 1;
504 break; 508 break;
505 509
@@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
540 struct super_block *sb = root->d_sb; 544 struct super_block *sb = root->d_sb;
541 struct fuse_conn *fc = get_fuse_conn_super(sb); 545 struct fuse_conn *fc = get_fuse_conn_super(sb);
542 546
543 seq_printf(m, ",user_id=%u", fc->user_id); 547 seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
544 seq_printf(m, ",group_id=%u", fc->group_id); 548 seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
545 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) 549 if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
546 seq_puts(m, ",default_permissions"); 550 seq_puts(m, ",default_permissions");
547 if (fc->flags & FUSE_ALLOW_OTHER) 551 if (fc->flags & FUSE_ALLOW_OTHER)
@@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
989 if (!file) 993 if (!file)
990 goto err; 994 goto err;
991 995
992 if (file->f_op != &fuse_dev_operations) 996 if ((file->f_op != &fuse_dev_operations) ||
997 (file->f_cred->user_ns != &init_user_ns))
993 goto err_fput; 998 goto err_fput;
994 999
995 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 1000 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 01c4975da4bc..30de4f2a2ea9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -643,7 +643,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
643 goto out_unlock; 643 goto out_unlock;
644 644
645 requested = data_blocks + ind_blocks; 645 requested = data_blocks + ind_blocks;
646 error = gfs2_inplace_reserve(ip, requested); 646 error = gfs2_inplace_reserve(ip, requested, 0);
647 if (error) 647 if (error)
648 goto out_qunlock; 648 goto out_qunlock;
649 } 649 }
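
The trailing 0 here reflects a signature change that runs through the rest of this series: gfs2_inplace_reserve() gains a third allocation-flags argument, with existing callers passing 0 for the default block-placement policy and the inode-creation path below passing GFS2_AF_ORLOV to spread new top-level directories across resource groups. The prototype implied by the call sites (an assumption; the rgrp.h hunk is not shown here):

    /* Assumed from the callers in this diff: */
    extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested,
                                    u32 aflags);
    /* aflags: 0 = default placement, GFS2_AF_ORLOV = Orlov-style spreading. */
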
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1fd3ae237bdd..a68e91bcef3d 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -991,6 +991,41 @@ unlock:
991 return err; 991 return err;
992} 992}
993 993
994/**
995 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
996 * @inode: The inode being truncated
997 * @oldsize: The original (larger) size
998 * @newsize: The new smaller size
999 *
1000 * With jdata files, we have to journal a revoke for each block which is
1001 * truncated. As a result, we need to split this into separate transactions
1002 * if the number of pages being truncated gets too large.
1003 */
1004
1005#define GFS2_JTRUNC_REVOKES 8192
1006
1007static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1008{
1009 struct gfs2_sbd *sdp = GFS2_SB(inode);
1010 u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1011 u64 chunk;
1012 int error;
1013
1014 while (oldsize != newsize) {
1015 chunk = oldsize - newsize;
1016 if (chunk > max_chunk)
1017 chunk = max_chunk;
1018 truncate_pagecache(inode, oldsize, oldsize - chunk);
1019 oldsize -= chunk;
1020 gfs2_trans_end(sdp);
1021 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1022 if (error)
1023 return error;
1024 }
1025
1026 return 0;
1027}
1028
994static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) 1029static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
995{ 1030{
996 struct gfs2_inode *ip = GFS2_I(inode); 1031 struct gfs2_inode *ip = GFS2_I(inode);
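
Worked numbers for the chunking above: with the common 4 KiB block size, max_chunk = 8192 revokes * 4096 bytes = 32 MiB, so truncating 1 GiB of jdata proceeds as 32 bounded transactions rather than one transaction with an unbounded revoke count. A quick check (plain userspace sketch):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long revokes = 8192; /* GFS2_JTRUNC_REVOKES */
        unsigned long long bsize = 4096;   /* assumed block size */
        unsigned long long max_chunk = revokes * bsize;
        printf("max_chunk = %llu bytes (%llu MiB)\n",
               max_chunk, max_chunk >> 20);
        return 0;
    }
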
@@ -1000,8 +1035,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1000 int journaled = gfs2_is_jdata(ip); 1035 int journaled = gfs2_is_jdata(ip);
1001 int error; 1036 int error;
1002 1037
1003 error = gfs2_trans_begin(sdp, 1038 if (journaled)
1004 RES_DINODE + (journaled ? RES_JDATA : 0), 0); 1039 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1040 else
1041 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1005 if (error) 1042 if (error)
1006 return error; 1043 return error;
1007 1044
@@ -1026,7 +1063,16 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1026 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1063 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1027 gfs2_dinode_out(ip, dibh->b_data); 1064 gfs2_dinode_out(ip, dibh->b_data);
1028 1065
1029 truncate_pagecache(inode, oldsize, newsize); 1066 if (journaled)
1067 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1068 else
1069 truncate_pagecache(inode, oldsize, newsize);
1070
1071 if (error) {
1072 brelse(dibh);
1073 return error;
1074 }
1075
1030out_brelse: 1076out_brelse:
1031 brelse(dibh); 1077 brelse(dibh);
1032out: 1078out:
@@ -1178,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
1178 if (error) 1224 if (error)
1179 return error; 1225 return error;
1180 1226
1181 error = gfs2_inplace_reserve(ip, 1); 1227 error = gfs2_inplace_reserve(ip, 1, 0);
1182 if (error) 1228 if (error)
1183 goto do_grow_qunlock; 1229 goto do_grow_qunlock;
1184 unstuff = 1; 1230 unstuff = 1;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 259b088cfc4c..9a35670fdc38 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1676,16 +1676,11 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1676 be16_add_cpu(&leaf->lf_entries, 1); 1676 be16_add_cpu(&leaf->lf_entries, 1);
1677 } 1677 }
1678 brelse(bh); 1678 brelse(bh);
1679 error = gfs2_meta_inode_buffer(ip, &bh);
1680 if (error)
1681 break;
1682 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1683 ip->i_entries++; 1679 ip->i_entries++;
1684 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1680 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1685 if (S_ISDIR(nip->i_inode.i_mode)) 1681 if (S_ISDIR(nip->i_inode.i_mode))
1686 inc_nlink(&ip->i_inode); 1682 inc_nlink(&ip->i_inode);
1687 gfs2_dinode_out(ip, bh->b_data); 1683 mark_inode_dirty(inode);
1688 brelse(bh);
1689 error = 0; 1684 error = 0;
1690 break; 1685 break;
1691 } 1686 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0def0504afc1..991ab2d484dd 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -44,7 +44,7 @@
44 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
45 * @file: the file 45 * @file: the file
46 * @offset: the offset 46 * @offset: the offset
47 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) 47 * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
48 * 48 *
49 * SEEK_END requires the glock for the file because it references the 49 * SEEK_END requires the glock for the file because it references the
50 * file's size. 50 * file's size.
@@ -52,26 +52,26 @@
52 * Returns: The new offset, or errno 52 * Returns: The new offset, or errno
53 */ 53 */
54 54
55static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) 55static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
56{ 56{
57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 57 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
58 struct gfs2_holder i_gh; 58 struct gfs2_holder i_gh;
59 loff_t error; 59 loff_t error;
60 60
61 switch (origin) { 61 switch (whence) {
62 case SEEK_END: /* These reference inode->i_size */ 62 case SEEK_END: /* These reference inode->i_size */
63 case SEEK_DATA: 63 case SEEK_DATA:
64 case SEEK_HOLE: 64 case SEEK_HOLE:
65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 65 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
66 &i_gh); 66 &i_gh);
67 if (!error) { 67 if (!error) {
68 error = generic_file_llseek(file, offset, origin); 68 error = generic_file_llseek(file, offset, whence);
69 gfs2_glock_dq_uninit(&i_gh); 69 gfs2_glock_dq_uninit(&i_gh);
70 } 70 }
71 break; 71 break;
72 case SEEK_CUR: 72 case SEEK_CUR:
73 case SEEK_SET: 73 case SEEK_SET:
74 error = generic_file_llseek(file, offset, origin); 74 error = generic_file_llseek(file, offset, whence);
75 break; 75 break;
76 default: 76 default:
77 error = -EINVAL; 77 error = -EINVAL;
@@ -432,7 +432,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
432 if (ret) 432 if (ret)
433 goto out_unlock; 433 goto out_unlock;
434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); 434 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); 435 ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
436 if (ret) 436 if (ret)
437 goto out_quota_unlock; 437 goto out_quota_unlock;
438 438
@@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
516 struct gfs2_holder i_gh; 516 struct gfs2_holder i_gh;
517 int error; 517 int error;
518 518
519 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 519 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
520 error = gfs2_glock_nq(&i_gh); 520 &i_gh);
521 if (error == 0) {
522 file_accessed(file);
523 gfs2_glock_dq(&i_gh);
524 }
525 gfs2_holder_uninit(&i_gh);
526 if (error) 521 if (error)
527 return error; 522 return error;
523 /* grab lock to update inode */
524 gfs2_glock_dq_uninit(&i_gh);
525 file_accessed(file);
528 } 526 }
529 vma->vm_ops = &gfs2_vm_ops; 527 vma->vm_ops = &gfs2_vm_ops;
530 528
@@ -677,10 +675,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
677 size_t writesize = iov_length(iov, nr_segs); 675 size_t writesize = iov_length(iov, nr_segs);
678 struct dentry *dentry = file->f_dentry; 676 struct dentry *dentry = file->f_dentry;
679 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 677 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
680 struct gfs2_sbd *sdp;
681 int ret; 678 int ret;
682 679
683 sdp = GFS2_SB(file->f_mapping->host);
684 ret = gfs2_rs_alloc(ip); 680 ret = gfs2_rs_alloc(ip);
685 if (ret) 681 if (ret)
686 return ret; 682 return ret;
@@ -829,7 +825,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
829retry: 825retry:
830 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); 826 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
831 827
832 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); 828 error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
833 if (error) { 829 if (error) {
834 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { 830 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
835 bytes >>= 1; 831 bytes >>= 1;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e6c2fd53cab2..992c5c0cb504 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -55,8 +55,6 @@ struct gfs2_glock_iter {
55 55
56typedef void (*glock_examiner) (struct gfs2_glock * gl); 56typedef void (*glock_examiner) (struct gfs2_glock * gl);
57 57
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
61 59
62static struct dentry *gfs2_root; 60static struct dentry *gfs2_root;
@@ -107,10 +105,12 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{ 105{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); 106 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109 107
110 if (gl->gl_ops->go_flags & GLOF_ASPACE) 108 if (gl->gl_ops->go_flags & GLOF_ASPACE) {
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl); 109 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else 110 } else {
111 kfree(gl->gl_lksb.sb_lvbptr);
113 kmem_cache_free(gfs2_glock_cachep, gl); 112 kmem_cache_free(gfs2_glock_cachep, gl);
113 }
114} 114}
115 115
116void gfs2_glock_free(struct gfs2_glock *gl) 116void gfs2_glock_free(struct gfs2_glock *gl)
@@ -537,8 +537,8 @@ __acquires(&gl->gl_spin)
537 (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) 537 (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
538 clear_bit(GLF_BLOCKING, &gl->gl_flags); 538 clear_bit(GLF_BLOCKING, &gl->gl_flags);
539 spin_unlock(&gl->gl_spin); 539 spin_unlock(&gl->gl_spin);
540 if (glops->go_xmote_th) 540 if (glops->go_sync)
541 glops->go_xmote_th(gl); 541 glops->go_sync(gl);
542 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) 542 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
543 glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); 543 glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
544 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 544 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
@@ -547,7 +547,10 @@ __acquires(&gl->gl_spin)
547 if (sdp->sd_lockstruct.ls_ops->lm_lock) { 547 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
548 /* lock_dlm */ 548 /* lock_dlm */
549 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); 549 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
550 GLOCK_BUG_ON(gl, ret); 550 if (ret) {
551 printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret);
552 GLOCK_BUG_ON(gl, 1);
553 }
551 } else { /* lock_nolock */ 554 } else { /* lock_nolock */
552 finish_xmote(gl, target); 555 finish_xmote(gl, target);
553 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 556 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -736,6 +739,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
736 if (!gl) 739 if (!gl)
737 return -ENOMEM; 740 return -ENOMEM;
738 741
742 memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
743
744 if (glops->go_flags & GLOF_LVB) {
745 gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
746 if (!gl->gl_lksb.sb_lvbptr) {
747 kmem_cache_free(cachep, gl);
748 return -ENOMEM;
749 }
750 }
751
739 atomic_inc(&sdp->sd_glock_disposal); 752 atomic_inc(&sdp->sd_glock_disposal);
740 gl->gl_sbd = sdp; 753 gl->gl_sbd = sdp;
741 gl->gl_flags = 0; 754 gl->gl_flags = 0;
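
With the fixed 32-byte gl_lvb[] array removed from struct gfs2_glock (see the incore.h hunk below), the lock value block is now heap-allocated only for glock types flagged GLOF_LVB, and every path that frees a glock must free the LVB first, as the kfree() calls added in this diff do. The pairing, in sketch form:

    /* Allocation (in gfs2_glock_get(), as above): */
    if (glops->go_flags & GLOF_LVB) {
        gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
        if (!gl->gl_lksb.sb_lvbptr)
            return -ENOMEM; /* after freeing gl itself */
    }

    /* Every teardown path mirrors it; kfree(NULL) is a no-op, so this
     * is safe for glocks that never had an LVB: */
    kfree(gl->gl_lksb.sb_lvbptr);
    kmem_cache_free(gfs2_glock_cachep, gl);
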
@@ -753,9 +766,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
753 preempt_enable(); 766 preempt_enable();
754 gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; 767 gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0;
755 gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; 768 gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;
756 memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
757 memset(gl->gl_lvb, 0, 32 * sizeof(char));
758 gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
759 gl->gl_tchange = jiffies; 769 gl->gl_tchange = jiffies;
760 gl->gl_object = NULL; 770 gl->gl_object = NULL;
761 gl->gl_hold_time = GL_GLOCK_DFT_HOLD; 771 gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
@@ -768,7 +778,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
768 mapping->host = s->s_bdev->bd_inode; 778 mapping->host = s->s_bdev->bd_inode;
769 mapping->flags = 0; 779 mapping->flags = 0;
770 mapping_set_gfp_mask(mapping, GFP_NOFS); 780 mapping_set_gfp_mask(mapping, GFP_NOFS);
771 mapping->assoc_mapping = NULL; 781 mapping->private_data = NULL;
772 mapping->backing_dev_info = s->s_bdi; 782 mapping->backing_dev_info = s->s_bdi;
773 mapping->writeback_index = 0; 783 mapping->writeback_index = 0;
774 } 784 }
@@ -777,6 +787,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
777 tmp = search_bucket(hash, sdp, &name); 787 tmp = search_bucket(hash, sdp, &name);
778 if (tmp) { 788 if (tmp) {
779 spin_unlock_bucket(hash); 789 spin_unlock_bucket(hash);
790 kfree(gl->gl_lksb.sb_lvbptr);
780 kmem_cache_free(cachep, gl); 791 kmem_cache_free(cachep, gl);
781 atomic_dec(&sdp->sd_glock_disposal); 792 atomic_dec(&sdp->sd_glock_disposal);
782 gl = tmp; 793 gl = tmp;
@@ -1013,7 +1024,7 @@ trap_recursive:
1013 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); 1024 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
1014 printk(KERN_ERR "lock type: %d req lock state : %d\n", 1025 printk(KERN_ERR "lock type: %d req lock state : %d\n",
1015 gh->gh_gl->gl_name.ln_type, gh->gh_state); 1026 gh->gh_gl->gl_name.ln_type, gh->gh_state);
1016 __dump_glock(NULL, gl); 1027 gfs2_dump_glock(NULL, gl);
1017 BUG(); 1028 BUG();
1018} 1029}
1019 1030
@@ -1508,7 +1519,7 @@ static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1508{ 1519{
1509 int ret; 1520 int ret;
1510 spin_lock(&gl->gl_spin); 1521 spin_lock(&gl->gl_spin);
1511 ret = __dump_glock(seq, gl); 1522 ret = gfs2_dump_glock(seq, gl);
1512 spin_unlock(&gl->gl_spin); 1523 spin_unlock(&gl->gl_spin);
1513 return ret; 1524 return ret;
1514} 1525}
@@ -1528,6 +1539,7 @@ static void dump_glock_func(struct gfs2_glock *gl)
1528 1539
1529void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1540void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1530{ 1541{
1542 set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
1531 glock_hash_walk(clear_glock, sdp); 1543 glock_hash_walk(clear_glock, sdp);
1532 flush_workqueue(glock_workqueue); 1544 flush_workqueue(glock_workqueue);
1533 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1545 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
@@ -1655,7 +1667,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1655} 1667}
1656 1668
1657/** 1669/**
1658 * __dump_glock - print information about a glock 1670 * gfs2_dump_glock - print information about a glock
1659 * @seq: The seq_file struct 1671 * @seq: The seq_file struct
1660 * @gl: the glock 1672 * @gl: the glock
1661 * 1673 *
@@ -1672,7 +1684,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1672 * Returns: 0 on success, -ENOBUFS when we run out of space 1684 * Returns: 0 on success, -ENOBUFS when we run out of space
1673 */ 1685 */
1674 1686
1675static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1687int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1676{ 1688{
1677 const struct gfs2_glock_operations *glops = gl->gl_ops; 1689 const struct gfs2_glock_operations *glops = gl->gl_ops;
1678 unsigned long long dtime; 1690 unsigned long long dtime;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 307ac31df781..fd580b7861d5 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -178,33 +178,33 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
178 return NULL; 178 return NULL;
179} 179}
180 180
181int gfs2_glock_get(struct gfs2_sbd *sdp, 181extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
182 u64 number, const struct gfs2_glock_operations *glops, 182 const struct gfs2_glock_operations *glops,
183 int create, struct gfs2_glock **glp); 183 int create, struct gfs2_glock **glp);
184void gfs2_glock_hold(struct gfs2_glock *gl); 184extern void gfs2_glock_hold(struct gfs2_glock *gl);
185void gfs2_glock_put_nolock(struct gfs2_glock *gl); 185extern void gfs2_glock_put_nolock(struct gfs2_glock *gl);
186void gfs2_glock_put(struct gfs2_glock *gl); 186extern void gfs2_glock_put(struct gfs2_glock *gl);
187void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 187extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
188 struct gfs2_holder *gh); 188 unsigned flags, struct gfs2_holder *gh);
189void gfs2_holder_reinit(unsigned int state, unsigned flags, 189extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
190 struct gfs2_holder *gh); 190 struct gfs2_holder *gh);
191void gfs2_holder_uninit(struct gfs2_holder *gh); 191extern void gfs2_holder_uninit(struct gfs2_holder *gh);
192int gfs2_glock_nq(struct gfs2_holder *gh); 192extern int gfs2_glock_nq(struct gfs2_holder *gh);
193int gfs2_glock_poll(struct gfs2_holder *gh); 193extern int gfs2_glock_poll(struct gfs2_holder *gh);
194int gfs2_glock_wait(struct gfs2_holder *gh); 194extern int gfs2_glock_wait(struct gfs2_holder *gh);
195void gfs2_glock_dq(struct gfs2_holder *gh); 195extern void gfs2_glock_dq(struct gfs2_holder *gh);
196void gfs2_glock_dq_wait(struct gfs2_holder *gh); 196extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
197 197extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
198void gfs2_glock_dq_uninit(struct gfs2_holder *gh); 198extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199int gfs2_glock_nq_num(struct gfs2_sbd *sdp, 199 const struct gfs2_glock_operations *glops,
200 u64 number, const struct gfs2_glock_operations *glops, 200 unsigned int state, int flags,
201 unsigned int state, int flags, struct gfs2_holder *gh); 201 struct gfs2_holder *gh);
202 202extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
203int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 203extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
204void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 204extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
205void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 205extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
206 206#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
207__printf(2, 3) 207extern __printf(2, 3)
208void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 208void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
209 209
210/** 210/**
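
GLOCK_BUG_ON() moves into the header alongside the now-exported gfs2_dump_glock(), so glops.c can use it too (see below). Its do { ... } while(0) wrapper is the standard idiom for multi-statement macros: the expansion is a single statement, so a trailing semicolon composes correctly with if/else. Illustration with hypothetical names:

    #define TWO_STEPS(x) do { step_one(x); step_two(x); } while (0)

    if (cond)
        TWO_STEPS(v);   /* still one statement; the else below still binds */
    else
        other_path();
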
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 32cc4fde975c..78d4184ffc7d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -74,7 +74,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
74 74
75 gfs2_trans_add_revoke(sdp, bd); 75 gfs2_trans_add_revoke(sdp, bd);
76 } 76 }
77 BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); 77 GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
78 spin_unlock(&sdp->sd_ail_lock); 78 spin_unlock(&sdp->sd_ail_lock);
79 gfs2_log_unlock(sdp); 79 gfs2_log_unlock(sdp);
80} 80}
@@ -96,7 +96,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
96 tr.tr_ip = (unsigned long)__builtin_return_address(0); 96 tr.tr_ip = (unsigned long)__builtin_return_address(0);
97 sb_start_intwrite(sdp->sd_vfs); 97 sb_start_intwrite(sdp->sd_vfs);
98 gfs2_log_reserve(sdp, tr.tr_reserved); 98 gfs2_log_reserve(sdp, tr.tr_reserved);
99 BUG_ON(current->journal_info); 99 WARN_ON_ONCE(current->journal_info);
100 current->journal_info = &tr; 100 current->journal_info = &tr;
101 101
102 __gfs2_ail_flush(gl, 0); 102 __gfs2_ail_flush(gl, 0);
@@ -139,7 +139,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
139 139
140 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 140 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
141 return; 141 return;
142 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); 142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 143
144 gfs2_log_flush(gl->gl_sbd, gl); 144 gfs2_log_flush(gl->gl_sbd, gl);
145 filemap_fdatawrite(metamapping); 145 filemap_fdatawrite(metamapping);
@@ -168,7 +168,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 168{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 169 struct address_space *mapping = gfs2_glock2aspace(gl);
170 170
171 BUG_ON(!(flags & DIO_METADATA)); 171 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 173 truncate_inode_pages(mapping, 0);
174 174
@@ -197,7 +197,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
197 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 197 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
198 return; 198 return;
199 199
200 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); 200 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
201 201
202 gfs2_log_flush(gl->gl_sbd, gl); 202 gfs2_log_flush(gl->gl_sbd, gl);
203 filemap_fdatawrite(metamapping); 203 filemap_fdatawrite(metamapping);
@@ -536,7 +536,7 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
536}; 536};
537 537
538const struct gfs2_glock_operations gfs2_inode_glops = { 538const struct gfs2_glock_operations gfs2_inode_glops = {
539 .go_xmote_th = inode_go_sync, 539 .go_sync = inode_go_sync,
540 .go_inval = inode_go_inval, 540 .go_inval = inode_go_inval,
541 .go_demote_ok = inode_go_demote_ok, 541 .go_demote_ok = inode_go_demote_ok,
542 .go_lock = inode_go_lock, 542 .go_lock = inode_go_lock,
@@ -546,17 +546,17 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
546}; 546};
547 547
548const struct gfs2_glock_operations gfs2_rgrp_glops = { 548const struct gfs2_glock_operations gfs2_rgrp_glops = {
549 .go_xmote_th = rgrp_go_sync, 549 .go_sync = rgrp_go_sync,
550 .go_inval = rgrp_go_inval, 550 .go_inval = rgrp_go_inval,
551 .go_lock = gfs2_rgrp_go_lock, 551 .go_lock = gfs2_rgrp_go_lock,
552 .go_unlock = gfs2_rgrp_go_unlock, 552 .go_unlock = gfs2_rgrp_go_unlock,
553 .go_dump = gfs2_rgrp_dump, 553 .go_dump = gfs2_rgrp_dump,
554 .go_type = LM_TYPE_RGRP, 554 .go_type = LM_TYPE_RGRP,
555 .go_flags = GLOF_ASPACE, 555 .go_flags = GLOF_ASPACE | GLOF_LVB,
556}; 556};
557 557
558const struct gfs2_glock_operations gfs2_trans_glops = { 558const struct gfs2_glock_operations gfs2_trans_glops = {
559 .go_xmote_th = trans_go_sync, 559 .go_sync = trans_go_sync,
560 .go_xmote_bh = trans_go_xmote_bh, 560 .go_xmote_bh = trans_go_xmote_bh,
561 .go_demote_ok = trans_go_demote_ok, 561 .go_demote_ok = trans_go_demote_ok,
562 .go_type = LM_TYPE_NONDISK, 562 .go_type = LM_TYPE_NONDISK,
@@ -577,6 +577,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
577 577
578const struct gfs2_glock_operations gfs2_quota_glops = { 578const struct gfs2_glock_operations gfs2_quota_glops = {
579 .go_type = LM_TYPE_QUOTA, 579 .go_type = LM_TYPE_QUOTA,
580 .go_flags = GLOF_LVB,
580}; 581};
581 582
582const struct gfs2_glock_operations gfs2_journal_glops = { 583const struct gfs2_glock_operations gfs2_journal_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3d469d37345e..c373a24fedd9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -205,7 +205,7 @@ struct lm_lockname {
205 205
206 206
207struct gfs2_glock_operations { 207struct gfs2_glock_operations {
208 void (*go_xmote_th) (struct gfs2_glock *gl); 208 void (*go_sync) (struct gfs2_glock *gl);
209 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 209 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
210 void (*go_inval) (struct gfs2_glock *gl, int flags); 210 void (*go_inval) (struct gfs2_glock *gl, int flags);
211 int (*go_demote_ok) (const struct gfs2_glock *gl); 211 int (*go_demote_ok) (const struct gfs2_glock *gl);
@@ -216,6 +216,7 @@ struct gfs2_glock_operations {
216 const int go_type; 216 const int go_type;
217 const unsigned long go_flags; 217 const unsigned long go_flags;
218#define GLOF_ASPACE 1 218#define GLOF_ASPACE 1
219#define GLOF_LVB 2
219}; 220};
220 221
221enum { 222enum {
@@ -321,7 +322,6 @@ struct gfs2_glock {
321 ktime_t gl_dstamp; 322 ktime_t gl_dstamp;
322 struct gfs2_lkstats gl_stats; 323 struct gfs2_lkstats gl_stats;
323 struct dlm_lksb gl_lksb; 324 struct dlm_lksb gl_lksb;
324 char gl_lvb[32];
325 unsigned long gl_tchange; 325 unsigned long gl_tchange;
326 void *gl_object; 326 void *gl_object;
327 327
@@ -539,6 +539,7 @@ enum {
539 SDF_DEMOTE = 5, 539 SDF_DEMOTE = 5,
540 SDF_NOJOURNALID = 6, 540 SDF_NOJOURNALID = 6,
541 SDF_RORECOVERY = 7, /* read only recovery */ 541 SDF_RORECOVERY = 7, /* read only recovery */
542 SDF_SKIP_DLM_UNLOCK = 8,
542}; 543};
543 544
544#define GFS2_FSNAME_LEN 256 545#define GFS2_FSNAME_LEN 256
@@ -621,6 +622,7 @@ struct gfs2_sbd {
621 u32 sd_hash_bsize_shift; 622 u32 sd_hash_bsize_shift;
622 u32 sd_hash_ptrs; /* Number of pointers in a hash block */ 623 u32 sd_hash_ptrs; /* Number of pointers in a hash block */
623 u32 sd_qc_per_block; 624 u32 sd_qc_per_block;
625 u32 sd_blocks_per_bitmap;
624 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ 626 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
625 u32 sd_max_height; /* Max height of a file's metadata tree */ 627 u32 sd_max_height; /* Max height of a file's metadata tree */
626 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; 628 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 381893ceefa4..2b6f5698ef18 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -364,34 +364,34 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
364 return 0; 364 return 0;
365} 365}
366 366
367static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, 367static void munge_mode_uid_gid(const struct gfs2_inode *dip,
368 unsigned int *uid, unsigned int *gid) 368 struct inode *inode)
369{ 369{
370 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && 370 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
371 (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { 371 (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
372 if (S_ISDIR(*mode)) 372 if (S_ISDIR(inode->i_mode))
373 *mode |= S_ISUID; 373 inode->i_mode |= S_ISUID;
374 else if (dip->i_inode.i_uid != current_fsuid()) 374 else if (dip->i_inode.i_uid != current_fsuid())
375 *mode &= ~07111; 375 inode->i_mode &= ~07111;
376 *uid = dip->i_inode.i_uid; 376 inode->i_uid = dip->i_inode.i_uid;
377 } else 377 } else
378 *uid = current_fsuid(); 378 inode->i_uid = current_fsuid();
379 379
380 if (dip->i_inode.i_mode & S_ISGID) { 380 if (dip->i_inode.i_mode & S_ISGID) {
381 if (S_ISDIR(*mode)) 381 if (S_ISDIR(inode->i_mode))
382 *mode |= S_ISGID; 382 inode->i_mode |= S_ISGID;
383 *gid = dip->i_inode.i_gid; 383 inode->i_gid = dip->i_inode.i_gid;
384 } else 384 } else
385 *gid = current_fsgid(); 385 inode->i_gid = current_fsgid();
386} 386}
387 387
388static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) 388static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
389{ 389{
390 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 390 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
391 int error; 391 int error;
392 int dblocks = 1; 392 int dblocks = 1;
393 393
394 error = gfs2_inplace_reserve(dip, RES_DINODE); 394 error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
395 if (error) 395 if (error)
396 goto out; 396 goto out;
397 397
@@ -399,12 +399,15 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
399 if (error) 399 if (error)
400 goto out_ipreserv; 400 goto out_ipreserv;
401 401
402 error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); 402 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation);
403 ip->i_no_formal_ino = ip->i_generation;
404 ip->i_inode.i_ino = ip->i_no_addr;
405 ip->i_goal = ip->i_no_addr;
403 406
404 gfs2_trans_end(sdp); 407 gfs2_trans_end(sdp);
405 408
406out_ipreserv: 409out_ipreserv:
407 gfs2_inplace_release(dip); 410 gfs2_inplace_release(ip);
408out: 411out:
409 return error; 412 return error;
410} 413}
@@ -429,52 +432,42 @@ static void gfs2_init_dir(struct buffer_head *dibh,
429/** 432/**
430 * init_dinode - Fill in a new dinode structure 433 * init_dinode - Fill in a new dinode structure
431 * @dip: The directory this inode is being created in 434 * @dip: The directory this inode is being created in
432 * @gl: The glock covering the new inode 435 * @ip: The inode
433 * @inum: The inode number
434 * @mode: The file permissions
435 * @uid: The uid of the new inode
436 * @gid: The gid of the new inode
437 * @generation: The generation number of the new inode
438 * @dev: The device number (if a device node)
439 * @symname: The symlink destination (if a symlink) 436 * @symname: The symlink destination (if a symlink)
440 * @size: The inode size (ignored for directories)
441 * @bhp: The buffer head (returned to caller) 437 * @bhp: The buffer head (returned to caller)
442 * 438 *
443 */ 439 */
444 440
445static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 441static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
446 const struct gfs2_inum_host *inum, umode_t mode, 442 const char *symname, struct buffer_head **bhp)
447 unsigned int uid, unsigned int gid,
448 const u64 *generation, dev_t dev, const char *symname,
449 unsigned size, struct buffer_head **bhp)
450{ 443{
451 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 444 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
452 struct gfs2_dinode *di; 445 struct gfs2_dinode *di;
453 struct buffer_head *dibh; 446 struct buffer_head *dibh;
454 struct timespec tv = CURRENT_TIME; 447 struct timespec tv = CURRENT_TIME;
455 448
456 dibh = gfs2_meta_new(gl, inum->no_addr); 449 dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
457 gfs2_trans_add_bh(gl, dibh, 1); 450 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
458 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); 451 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
459 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 452 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
460 di = (struct gfs2_dinode *)dibh->b_data; 453 di = (struct gfs2_dinode *)dibh->b_data;
461 454
462 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); 455 di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
463 di->di_num.no_addr = cpu_to_be64(inum->no_addr); 456 di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
464 di->di_mode = cpu_to_be32(mode); 457 di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
465 di->di_uid = cpu_to_be32(uid); 458 di->di_uid = cpu_to_be32(ip->i_inode.i_uid);
466 di->di_gid = cpu_to_be32(gid); 459 di->di_gid = cpu_to_be32(ip->i_inode.i_gid);
467 di->di_nlink = 0; 460 di->di_nlink = 0;
468 di->di_size = cpu_to_be64(size); 461 di->di_size = cpu_to_be64(ip->i_inode.i_size);
469 di->di_blocks = cpu_to_be64(1); 462 di->di_blocks = cpu_to_be64(1);
470 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); 463 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
471 di->di_major = cpu_to_be32(MAJOR(dev)); 464 di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
472 di->di_minor = cpu_to_be32(MINOR(dev)); 465 di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
473 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); 466 di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr);
474 di->di_generation = cpu_to_be64(*generation); 467 di->di_generation = cpu_to_be64(ip->i_generation);
475 di->di_flags = 0; 468 di->di_flags = 0;
476 di->__pad1 = 0; 469 di->__pad1 = 0;
477 di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); 470 di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0);
478 di->di_height = 0; 471 di->di_height = 0;
479 di->__pad2 = 0; 472 di->__pad2 = 0;
480 di->__pad3 = 0; 473 di->__pad3 = 0;
@@ -487,7 +480,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
487 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); 480 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
488 memset(&di->di_reserved, 0, sizeof(di->di_reserved)); 481 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
489 482
490 switch(mode & S_IFMT) { 483 switch(ip->i_inode.i_mode & S_IFMT) {
491 case S_IFREG: 484 case S_IFREG:
492 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || 485 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
493 gfs2_tune_get(sdp, gt_new_files_jdata)) 486 gfs2_tune_get(sdp, gt_new_files_jdata))
@@ -502,7 +495,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
502 gfs2_init_dir(dibh, dip); 495 gfs2_init_dir(dibh, dip);
503 break; 496 break;
504 case S_IFLNK: 497 case S_IFLNK:
505 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); 498 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size);
506 break; 499 break;
507 } 500 }
508 501
@@ -511,25 +504,22 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
511 *bhp = dibh; 504 *bhp = dibh;
512} 505}
513 506
514static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 507static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
515 umode_t mode, const struct gfs2_inum_host *inum, 508 const char *symname, struct buffer_head **bhp)
516 const u64 *generation, dev_t dev, const char *symname,
517 unsigned int size, struct buffer_head **bhp)
518{ 509{
510 struct inode *inode = &ip->i_inode;
519 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 511 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
520 unsigned int uid, gid;
521 int error; 512 int error;
522 513
523 munge_mode_uid_gid(dip, &mode, &uid, &gid);
524 error = gfs2_rindex_update(sdp); 514 error = gfs2_rindex_update(sdp);
525 if (error) 515 if (error)
526 return error; 516 return error;
527 517
528 error = gfs2_quota_lock(dip, uid, gid); 518 error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid);
529 if (error) 519 if (error)
530 return error; 520 return error;
531 521
532 error = gfs2_quota_check(dip, uid, gid); 522 error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid);
533 if (error) 523 if (error)
534 goto out_quota; 524 goto out_quota;
535 525
@@ -537,8 +527,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
537 if (error) 527 if (error)
538 goto out_quota; 528 goto out_quota;
539 529
540 init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); 530 init_dinode(dip, ip, symname, bhp);
541 gfs2_quota_change(dip, +1, uid, gid); 531 gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid);
542 gfs2_trans_end(sdp); 532 gfs2_trans_end(sdp);
543 533
544out_quota: 534out_quota:
@@ -570,7 +560,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
570 if (error) 560 if (error)
571 goto fail_quota_locks; 561 goto fail_quota_locks;
572 562
573 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); 563 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
574 if (error) 564 if (error)
575 goto fail_quota_locks; 565 goto fail_quota_locks;
576 566
@@ -657,19 +647,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
657 struct inode *inode = NULL; 647 struct inode *inode = NULL;
658 struct gfs2_inode *dip = GFS2_I(dir), *ip; 648 struct gfs2_inode *dip = GFS2_I(dir), *ip;
659 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 649 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
660 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 650 struct gfs2_glock *io_gl;
661 int error; 651 int error;
662 u64 generation;
663 struct buffer_head *bh = NULL; 652 struct buffer_head *bh = NULL;
653 u32 aflags = 0;
664 654
665 if (!name->len || name->len > GFS2_FNAMESIZE) 655 if (!name->len || name->len > GFS2_FNAMESIZE)
666 return -ENAMETOOLONG; 656 return -ENAMETOOLONG;
667 657
668 /* We need a reservation to allocate the new dinode block. The
669 directory ip temporarily points to the reservation, but this is
670 being done to get a set of contiguous blocks for the new dinode.
671 Since this is a create, we don't have a sizehint yet, so it will
672 have to use the minimum reservation size. */
673 error = gfs2_rs_alloc(dip); 658 error = gfs2_rs_alloc(dip);
674 if (error) 659 if (error)
675 return error; 660 return error;
@@ -688,45 +673,72 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
688 if (error) 673 if (error)
689 goto fail_gunlock; 674 goto fail_gunlock;
690 675
691 error = alloc_dinode(dip, &inum.no_addr, &generation); 676 inode = new_inode(sdp->sd_vfs);
677 if (!inode) {
678 gfs2_glock_dq_uninit(ghs);
679 return -ENOMEM;
680 }
681 ip = GFS2_I(inode);
682 error = gfs2_rs_alloc(ip);
692 if (error) 683 if (error)
693 goto fail_gunlock; 684 goto fail_free_inode;
694 inum.no_formal_ino = generation; 685
686 set_bit(GIF_INVALID, &ip->i_flags);
687 inode->i_mode = mode;
688 inode->i_rdev = dev;
689 inode->i_size = size;
690 munge_mode_uid_gid(dip, inode);
691 ip->i_goal = dip->i_goal;
695 692
696 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, 693 if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) ||
697 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); 694 (dip->i_diskflags & GFS2_DIF_TOPDIR))
695 aflags |= GFS2_AF_ORLOV;
696
697 error = alloc_dinode(ip, aflags);
698 if (error) 698 if (error)
699 goto fail_gunlock; 699 goto fail_free_inode;
700 700
701 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); 701 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
702 if (error) 702 if (error)
703 goto fail_gunlock2; 703 goto fail_free_inode;
704 704
705 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 705 ip->i_gl->gl_object = ip;
706 inum.no_formal_ino, 0); 706 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
707 if (IS_ERR(inode)) 707 if (error)
708 goto fail_free_inode;
709
710 error = make_dinode(dip, ip, symname, &bh);
711 if (error)
708 goto fail_gunlock2; 712 goto fail_gunlock2;
709 713
710 ip = GFS2_I(inode); 714 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
711 error = gfs2_inode_refresh(ip);
712 if (error) 715 if (error)
713 goto fail_gunlock2; 716 goto fail_gunlock2;
714 717
715 error = gfs2_rs_alloc(ip); 718 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
716 if (error) 719 if (error)
717 goto fail_gunlock2; 720 goto fail_gunlock2;
718 721
722 ip->i_iopen_gh.gh_gl->gl_object = ip;
723 gfs2_glock_put(io_gl);
724 gfs2_set_iop(inode);
725 insert_inode_hash(inode);
726
727 error = gfs2_inode_refresh(ip);
728 if (error)
729 goto fail_gunlock3;
730
719 error = gfs2_acl_create(dip, inode); 731 error = gfs2_acl_create(dip, inode);
720 if (error) 732 if (error)
721 goto fail_gunlock2; 733 goto fail_gunlock3;
722 734
723 error = gfs2_security_init(dip, ip, name); 735 error = gfs2_security_init(dip, ip, name);
724 if (error) 736 if (error)
725 goto fail_gunlock2; 737 goto fail_gunlock3;
726 738
727 error = link_dinode(dip, name, ip); 739 error = link_dinode(dip, name, ip);
728 if (error) 740 if (error)
729 goto fail_gunlock2; 741 goto fail_gunlock3;
730 742
731 if (bh) 743 if (bh)
732 brelse(bh); 744 brelse(bh);
@@ -739,8 +751,20 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
739 d_instantiate(dentry, inode); 751 d_instantiate(dentry, inode);
740 return 0; 752 return 0;
741 753
754fail_gunlock3:
755 gfs2_glock_dq_uninit(ghs + 1);
756 if (ip->i_gl)
757 gfs2_glock_put(ip->i_gl);
758 goto fail_gunlock;
759
742fail_gunlock2: 760fail_gunlock2:
743 gfs2_glock_dq_uninit(ghs + 1); 761 gfs2_glock_dq_uninit(ghs + 1);
762fail_free_inode:
763 if (ip->i_gl)
764 gfs2_glock_put(ip->i_gl);
765 gfs2_rs_delete(ip);
766 free_inode_nonrcu(inode);
767 inode = NULL;
744fail_gunlock: 768fail_gunlock:
745 gfs2_glock_dq_uninit(ghs); 769 gfs2_glock_dq_uninit(ghs);
746 if (inode && !IS_ERR(inode)) { 770 if (inode && !IS_ERR(inode)) {
@@ -748,7 +772,6 @@ fail_gunlock:
748 iput(inode); 772 iput(inode);
749 } 773 }
750fail: 774fail:
751 gfs2_rs_delete(dip);
752 if (bh) 775 if (bh)
753 brelse(bh); 776 brelse(bh);
754 return error; 777 return error;
@@ -884,7 +907,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
884 if (error) 907 if (error)
885 goto out_gunlock; 908 goto out_gunlock;
886 909
887 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); 910 error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
888 if (error) 911 if (error)
889 goto out_gunlock_q; 912 goto out_gunlock_q;
890 913
@@ -977,7 +1000,6 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
977 * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it 1000 * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
978 * @dip: The parent directory 1001 * @dip: The parent directory
979 * @name: The name of the entry in the parent directory 1002 * @name: The name of the entry in the parent directory
980 * @bh: The inode buffer for the inode to be removed
981 * @inode: The inode to be removed 1003 * @inode: The inode to be removed
982 * 1004 *
983 * Called with all the locks and in a transaction. This will only be 1005 * Called with all the locks and in a transaction. This will only be
@@ -987,8 +1009,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
987 */ 1009 */
988 1010
989static int gfs2_unlink_inode(struct gfs2_inode *dip, 1011static int gfs2_unlink_inode(struct gfs2_inode *dip,
990 const struct dentry *dentry, 1012 const struct dentry *dentry)
991 struct buffer_head *bh)
992{ 1013{
993 struct inode *inode = dentry->d_inode; 1014 struct inode *inode = dentry->d_inode;
994 struct gfs2_inode *ip = GFS2_I(inode); 1015 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1028,7 +1049,6 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
1028 struct gfs2_sbd *sdp = GFS2_SB(dir); 1049 struct gfs2_sbd *sdp = GFS2_SB(dir);
1029 struct inode *inode = dentry->d_inode; 1050 struct inode *inode = dentry->d_inode;
1030 struct gfs2_inode *ip = GFS2_I(inode); 1051 struct gfs2_inode *ip = GFS2_I(inode);
1031 struct buffer_head *bh;
1032 struct gfs2_holder ghs[3]; 1052 struct gfs2_holder ghs[3];
1033 struct gfs2_rgrpd *rgd; 1053 struct gfs2_rgrpd *rgd;
1034 int error; 1054 int error;
@@ -1077,14 +1097,9 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
1077 1097
1078 error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); 1098 error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
1079 if (error) 1099 if (error)
1080 goto out_gunlock;
1081
1082 error = gfs2_meta_inode_buffer(ip, &bh);
1083 if (error)
1084 goto out_end_trans; 1100 goto out_end_trans;
1085 1101
1086 error = gfs2_unlink_inode(dip, dentry, bh); 1102 error = gfs2_unlink_inode(dip, dentry);
1087 brelse(bh);
1088 1103
1089out_end_trans: 1104out_end_trans:
1090 gfs2_trans_end(sdp); 1105 gfs2_trans_end(sdp);
@@ -1365,7 +1380,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1365 if (error) 1380 if (error)
1366 goto out_gunlock; 1381 goto out_gunlock;
1367 1382
1368 error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); 1383 error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0);
1369 if (error) 1384 if (error)
1370 goto out_gunlock_q; 1385 goto out_gunlock_q;
1371 1386
@@ -1384,14 +1399,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1384 1399
1385 /* Remove the target file, if it exists */ 1400 /* Remove the target file, if it exists */
1386 1401
1387 if (nip) { 1402 if (nip)
1388 struct buffer_head *bh; 1403 error = gfs2_unlink_inode(ndip, ndentry);
1389 error = gfs2_meta_inode_buffer(nip, &bh);
1390 if (error)
1391 goto out_end_trans;
1392 error = gfs2_unlink_inode(ndip, ndentry, bh);
1393 brelse(bh);
1394 }
1395 1404
1396 if (dir_rename) { 1405 if (dir_rename) {
1397 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); 1406 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0fb6539b0c8c..8dad6b093716 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -120,8 +120,8 @@ static void gdlm_ast(void *arg)
120 gfs2_update_reply_times(gl); 120 gfs2_update_reply_times(gl);
121 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 121 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
122 122
123 if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) 123 if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
124 memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); 124 memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
125 125
126 switch (gl->gl_lksb.sb_status) { 126 switch (gl->gl_lksb.sb_status) {
127 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 127 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
@@ -203,8 +203,10 @@ static int make_mode(const unsigned int lmstate)
203static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, 203static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
204 const int req) 204 const int req)
205{ 205{
206 u32 lkf = DLM_LKF_VALBLK; 206 u32 lkf = 0;
207 u32 lkid = gl->gl_lksb.sb_lkid; 207
208 if (gl->gl_lksb.sb_lvbptr)
209 lkf |= DLM_LKF_VALBLK;
208 210
209 if (gfs_flags & LM_FLAG_TRY) 211 if (gfs_flags & LM_FLAG_TRY)
210 lkf |= DLM_LKF_NOQUEUE; 212 lkf |= DLM_LKF_NOQUEUE;
@@ -228,7 +230,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
228 BUG(); 230 BUG();
229 } 231 }
230 232
231 if (lkid != 0) { 233 if (gl->gl_lksb.sb_lkid != 0) {
232 lkf |= DLM_LKF_CONVERT; 234 lkf |= DLM_LKF_CONVERT;
233 if (test_bit(GLF_BLOCKING, &gl->gl_flags)) 235 if (test_bit(GLF_BLOCKING, &gl->gl_flags))
234 lkf |= DLM_LKF_QUECVT; 236 lkf |= DLM_LKF_QUECVT;
@@ -289,6 +291,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
289 gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); 291 gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
290 gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); 292 gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
291 gfs2_update_request_times(gl); 293 gfs2_update_request_times(gl);
294
295 /* don't want to skip dlm_unlock writing the lvb when lock is ex */
296 if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
297 gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
298 gfs2_glock_free(gl);
299 return;
300 }
301
292 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 302 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
293 NULL, gl); 303 NULL, gl);
294 if (error) { 304 if (error) {
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8ff95a2d54ee..9ceccb1595a3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
393 struct gfs2_meta_header *mh; 393 struct gfs2_meta_header *mh;
394 struct gfs2_trans *tr; 394 struct gfs2_trans *tr;
395 395
396 lock_buffer(bd->bd_bh);
397 gfs2_log_lock(sdp);
398 tr = current->journal_info; 396 tr = current->journal_info;
399 tr->tr_touched = 1; 397 tr->tr_touched = 1;
400 if (!list_empty(&bd->bd_list)) 398 if (!list_empty(&bd->bd_list))
401 goto out; 399 return;
402 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 400 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
403 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 401 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
404 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; 402 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
@@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
414 sdp->sd_log_num_buf++; 412 sdp->sd_log_num_buf++;
415 list_add(&bd->bd_list, &sdp->sd_log_le_buf); 413 list_add(&bd->bd_list, &sdp->sd_log_le_buf);
416 tr->tr_num_buf_new++; 414 tr->tr_num_buf_new++;
417out:
418 gfs2_log_unlock(sdp);
419 unlock_buffer(bd->bd_bh);
420} 415}
421 416
422static void gfs2_check_magic(struct buffer_head *bh) 417static void gfs2_check_magic(struct buffer_head *bh)
@@ -621,7 +616,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
621 616
622static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 617static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
623{ 618{
624 struct gfs2_log_descriptor *ld;
625 struct gfs2_meta_header *mh; 619 struct gfs2_meta_header *mh;
626 unsigned int offset; 620 unsigned int offset;
627 struct list_head *head = &sdp->sd_log_le_revoke; 621 struct list_head *head = &sdp->sd_log_le_revoke;
@@ -634,7 +628,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
634 628
635 length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); 629 length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
636 page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); 630 page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
637 ld = page_address(page);
638 offset = sizeof(struct gfs2_log_descriptor); 631 offset = sizeof(struct gfs2_log_descriptor);
639 632
640 list_for_each_entry(bd, head, bd_list) { 633 list_for_each_entry(bd, head, bd_list) {
@@ -777,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
777 struct address_space *mapping = bd->bd_bh->b_page->mapping; 770 struct address_space *mapping = bd->bd_bh->b_page->mapping;
778 struct gfs2_inode *ip = GFS2_I(mapping->host); 771 struct gfs2_inode *ip = GFS2_I(mapping->host);
779 772
780 lock_buffer(bd->bd_bh);
781 gfs2_log_lock(sdp);
782 if (tr) 773 if (tr)
783 tr->tr_touched = 1; 774 tr->tr_touched = 1;
784 if (!list_empty(&bd->bd_list)) 775 if (!list_empty(&bd->bd_list))
785 goto out; 776 return;
786 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 777 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
787 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 778 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
788 if (gfs2_is_jdata(ip)) { 779 if (gfs2_is_jdata(ip)) {
@@ -793,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
793 } else { 784 } else {
794 list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); 785 list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
795 } 786 }
796out:
797 gfs2_log_unlock(sdp);
798 unlock_buffer(bd->bd_bh);
799} 787}
800 788
801/** 789/**
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e443966c8106..0e3554edb8f2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -278,6 +278,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
278 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - 278 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
279 sizeof(struct gfs2_meta_header)) / 279 sizeof(struct gfs2_meta_header)) /
280 sizeof(struct gfs2_quota_change); 280 sizeof(struct gfs2_quota_change);
281 sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize -
282 sizeof(struct gfs2_meta_header))
283 * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */
281 284
282 /* Compute maximum reservation required to add a entry to a directory */ 285 /* Compute maximum reservation required to add a entry to a directory */
283 286
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 40c4b0d42fa8..ae55e248c3b7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
497 struct gfs2_quota_data **qd; 497 struct gfs2_quota_data **qd;
498 int error; 498 int error;
499 499
500 if (ip->i_res == NULL) 500 if (ip->i_res == NULL) {
501 gfs2_rs_alloc(ip); 501 error = gfs2_rs_alloc(ip);
502 if (error)
503 return error;
504 }
502 505
503 qd = ip->i_res->rs_qa_qd; 506 qd = ip->i_res->rs_qa_qd;
504 507
@@ -813,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
813 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; 816 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
814 817
815 reserved = 1 + (nalloc * (data_blocks + ind_blocks)); 818 reserved = 1 + (nalloc * (data_blocks + ind_blocks));
816 error = gfs2_inplace_reserve(ip, reserved); 819 error = gfs2_inplace_reserve(ip, reserved, 0);
817 if (error) 820 if (error)
818 goto out_alloc; 821 goto out_alloc;
819 822
@@ -866,7 +869,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
866 if (error < 0) 869 if (error < 0)
867 return error; 870 return error;
868 871
869 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 872 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
870 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); 873 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
871 qlvb->__pad = 0; 874 qlvb->__pad = 0;
872 qlvb->qb_limit = q.qu_limit; 875 qlvb->qb_limit = q.qu_limit;
@@ -890,7 +893,7 @@ restart:
890 if (error) 893 if (error)
891 return error; 894 return error;
892 895
893 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 896 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
894 897
895 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { 898 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
896 gfs2_glock_dq_uninit(q_gh); 899 gfs2_glock_dq_uninit(q_gh);
@@ -1503,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1503 if (error) 1506 if (error)
1504 goto out; 1507 goto out;
1505 1508
1506 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 1509 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
1507 fdq->d_version = FS_DQUOT_VERSION; 1510 fdq->d_version = FS_DQUOT_VERSION;
1508 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1511 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1509 fdq->d_id = from_kqid(&init_user_ns, qid); 1512 fdq->d_id = from_kqid(&init_user_ns, qid);
@@ -1602,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1602 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), 1605 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
1603 &data_blocks, &ind_blocks); 1606 &data_blocks, &ind_blocks);
1604 blocks = 1 + data_blocks + ind_blocks; 1607 blocks = 1 + data_blocks + ind_blocks;
1605 error = gfs2_inplace_reserve(ip, blocks); 1608 error = gfs2_inplace_reserve(ip, blocks, 0);
1606 if (error) 1609 if (error)
1607 goto out_i; 1610 goto out_i;
1608 blocks += gfs2_rg_blocks(ip, blocks); 1611 blocks += gfs2_rg_blocks(ip, blocks);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3cc402ce6fea..37ee061d899e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -16,6 +16,7 @@
16#include <linux/prefetch.h> 16#include <linux/prefetch.h>
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
19#include <linux/random.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -251,22 +252,25 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
251static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) 252static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
252{ 253{
253 u64 rblock = block - rbm->rgd->rd_data0; 254 u64 rblock = block - rbm->rgd->rd_data0;
254 u32 goal = (u32)rblock; 255 u32 x;
255 int x;
256 256
257 if (WARN_ON_ONCE(rblock > UINT_MAX)) 257 if (WARN_ON_ONCE(rblock > UINT_MAX))
258 return -EINVAL; 258 return -EINVAL;
259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) 259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
260 return -E2BIG; 260 return -E2BIG;
261 261
262 for (x = 0; x < rbm->rgd->rd_length; x++) { 262 rbm->bi = rbm->rgd->rd_bits;
263 rbm->bi = rbm->rgd->rd_bits + x; 263 rbm->offset = (u32)(rblock);
264 if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { 264 /* Check if the block is within the first bitmap block */
265 rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); 265 if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY)
266 break; 266 return 0;
267 }
268 }
269 267
268 /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
269 rbm->offset += (sizeof(struct gfs2_rgrp) -
270 sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
271 x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
272 rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
273 rbm->bi += x;
270 return 0; 274 return 0;
271} 275}
272 276
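
The rewrite above replaces the per-bitmap scan with direct arithmetic: the first bitmap in an rgrp is shorter because the on-disk gfs2_rgrp header is larger than the gfs2_meta_header that starts every subsequent bitmap block, so the offset is padded by that size difference before dividing by the sd_blocks_per_bitmap value added in ops_fstype.c above. A standalone sketch of the same arithmetic, with assumed header sizes:

/* Standalone sketch of the new gfs2_rbm_from_block() arithmetic.
 * The header sizes are illustrative assumptions, not the real
 * on-disk structure sizes. */
#include <stdint.h>
#include <stdio.h>

#define GFS2_NBBY 4       /* 2 bits per block => 4 blocks per byte */
#define HDR_META  24      /* assumed sizeof(struct gfs2_meta_header) */
#define HDR_RGRP  40      /* assumed sizeof(struct gfs2_rgrp) */
#define BSIZE     4096    /* filesystem block size */

int main(void)
{
	/* The first bitmap is shorter (rgrp header); every later bitmap
	 * holds later_cap blocks, which sd_blocks_per_bitmap caches. */
	uint32_t first_cap = (BSIZE - HDR_RGRP) * GFS2_NBBY;
	uint32_t later_cap = (BSIZE - HDR_META) * GFS2_NBBY;

	uint32_t rblock = 20000;      /* block offset within the rgrp */
	uint32_t bi = 0, offset = rblock;

	if (offset >= first_cap) {
		/* Pad by the header-size difference so the short first
		 * bitmap can be treated as later_cap blocks wide too. */
		offset += (HDR_RGRP - HDR_META) * GFS2_NBBY;
		bi = offset / later_cap;
		offset -= bi * later_cap;
	}
	printf("bitmap %u, offset %u\n", bi, offset); /* bitmap 1, offset 3776 */
	return 0;
}
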
@@ -553,7 +557,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
553 */ 557 */
554int gfs2_rs_alloc(struct gfs2_inode *ip) 558int gfs2_rs_alloc(struct gfs2_inode *ip)
555{ 559{
556 int error = 0;
557 struct gfs2_blkreserv *res; 560 struct gfs2_blkreserv *res;
558 561
559 if (ip->i_res) 562 if (ip->i_res)
@@ -561,7 +564,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
561 564
562 res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); 565 res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
563 if (!res) 566 if (!res)
564 error = -ENOMEM; 567 return -ENOMEM;
565 568
566 RB_CLEAR_NODE(&res->rs_node); 569 RB_CLEAR_NODE(&res->rs_node);
567 570
@@ -571,7 +574,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
571 else 574 else
572 ip->i_res = res; 575 ip->i_res = res;
573 up_write(&ip->i_rw_mutex); 576 up_write(&ip->i_rw_mutex);
574 return error; 577 return 0;
575} 578}
576 579
577static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) 580static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -876,7 +879,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
876 goto fail; 879 goto fail;
877 880
878 rgd->rd_gl->gl_object = rgd; 881 rgd->rd_gl->gl_object = rgd;
879 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; 882 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
880 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 883 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
881 if (rgd->rd_data > sdp->sd_max_rg_data) 884 if (rgd->rd_data > sdp->sd_max_rg_data)
882 sdp->sd_max_rg_data = rgd->rd_data; 885 sdp->sd_max_rg_data = rgd->rd_data;
@@ -1263,7 +1266,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1263 int ret = 0; 1266 int ret = 0;
1264 u64 amt; 1267 u64 amt;
1265 u64 trimmed = 0; 1268 u64 trimmed = 0;
1269 u64 start, end, minlen;
1266 unsigned int x; 1270 unsigned int x;
1271 unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1267 1272
1268 if (!capable(CAP_SYS_ADMIN)) 1273 if (!capable(CAP_SYS_ADMIN))
1269 return -EPERM; 1274 return -EPERM;
@@ -1271,19 +1276,25 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1271 if (!blk_queue_discard(q)) 1276 if (!blk_queue_discard(q))
1272 return -EOPNOTSUPP; 1277 return -EOPNOTSUPP;
1273 1278
1274 if (argp == NULL) { 1279 if (copy_from_user(&r, argp, sizeof(r)))
1275 r.start = 0;
1276 r.len = ULLONG_MAX;
1277 r.minlen = 0;
1278 } else if (copy_from_user(&r, argp, sizeof(r)))
1279 return -EFAULT; 1280 return -EFAULT;
1280 1281
1281 ret = gfs2_rindex_update(sdp); 1282 ret = gfs2_rindex_update(sdp);
1282 if (ret) 1283 if (ret)
1283 return ret; 1284 return ret;
1284 1285
1285 rgd = gfs2_blk2rgrpd(sdp, r.start, 0); 1286 start = r.start >> bs_shift;
1286 rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); 1287 end = start + (r.len >> bs_shift);
1288 minlen = max_t(u64, r.minlen,
1289 q->limits.discard_granularity) >> bs_shift;
1290
1291 rgd = gfs2_blk2rgrpd(sdp, start, 0);
1292 rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
1293
1294 if (end <= start ||
1295 minlen > sdp->sd_max_rg_data ||
1296 start > rgd_end->rd_data0 + rgd_end->rd_data)
1297 return -EINVAL;
1287 1298
1288 while (1) { 1299 while (1) {
1289 1300
@@ -1295,7 +1306,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1295 /* Trim each bitmap in the rgrp */ 1306 /* Trim each bitmap in the rgrp */
1296 for (x = 0; x < rgd->rd_length; x++) { 1307 for (x = 0; x < rgd->rd_length; x++) {
1297 struct gfs2_bitmap *bi = rgd->rd_bits + x; 1308 struct gfs2_bitmap *bi = rgd->rd_bits + x;
1298 ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); 1309 ret = gfs2_rgrp_send_discards(sdp,
1310 rgd->rd_data0, NULL, bi, minlen,
1311 &amt);
1299 if (ret) { 1312 if (ret) {
1300 gfs2_glock_dq_uninit(&gh); 1313 gfs2_glock_dq_uninit(&gh);
1301 goto out; 1314 goto out;
@@ -1324,7 +1337,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1324 1337
1325out: 1338out:
1326 r.len = trimmed << 9; 1339 r.len = trimmed << 9;
1327 if (argp && copy_to_user(argp, &r, sizeof(r))) 1340 if (copy_to_user(argp, &r, sizeof(r)))
1328 return -EFAULT; 1341 return -EFAULT;
1329 1342
1330 return ret; 1343 return ret;
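
With this change gfs2_fitrim() treats the FITRIM range as bytes, per the ioctl ABI, converting to filesystem blocks with the block-size shift and clamping minlen to the device's discard granularity; it also stops special-casing a NULL argp. A worked sketch of the conversion, with all input values assumed for illustration:

/* Sketch of the byte-to-block conversion now done in gfs2_fitrim();
 * the input values are assumptions. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned bs_shift = 12;            /* 4096-byte blocks */
	uint64_t r_start  = 1ULL << 20;    /* FITRIM range, in bytes */
	uint64_t r_len    = 64ULL << 20;
	uint64_t r_minlen = 4096;
	uint64_t granularity = 512;        /* q->limits.discard_granularity */

	uint64_t start  = r_start >> bs_shift;          /* 256 */
	uint64_t end    = start + (r_len >> bs_shift);  /* 256 + 16384 */
	uint64_t minlen = (r_minlen > granularity ?
			   r_minlen : granularity) >> bs_shift;  /* 1 */

	if (end <= start)
		return 1;       /* the kernel returns -EINVAL here */

	printf("trim blocks [%llu, %llu), minlen %llu blocks\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)minlen);
	return 0;
}
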
@@ -1669,13 +1682,105 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1669 return; 1682 return;
1670} 1683}
1671 1684
1685/**
1686 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1687 * @rgd: The rgrp in question
1688 * @loops: An indication of how picky we can be (0=very, 1=less so)
1689 *
1690 * This function uses the recently added glock statistics in order to
1691 * figure out whether a particular resource group is suffering from
1692 * contention from multiple nodes. This is done purely on the basis
1693 * of timings, since this is the only data we have to work with and
1694 * our aim here is to reject a resource group which is highly contended
1695 * but (very important) not to do this too often in order to ensure that
1696 * we do not end up introducing fragmentation by changing resource
1697 * groups when not actually required.
1698 *
1699 * The calculation is fairly simple, we want to know whether the SRTTB
1700 * (i.e. smoothed round trip time for blocking operations) to acquire
1701 * the lock for this rgrp's glock is significantly greater than the
1702 * time taken for resource groups on average. We introduce a margin in
1703 * the form of the variable @var which is computed as the sum of the two
1704 * respective variances, and multiplied by a factor depending on @loops
1705 * and whether we have a lot of data to base the decision on. This is
1706 * then tested against the square difference of the means in order to
1707 * decide whether the result is statistically significant or not.
1708 *
1709 * Returns: A boolean verdict on the congestion status
1710 */
1711
1712static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1713{
1714 const struct gfs2_glock *gl = rgd->rd_gl;
1715 const struct gfs2_sbd *sdp = gl->gl_sbd;
1716 struct gfs2_lkstats *st;
1717 s64 r_dcount, l_dcount;
1718 s64 r_srttb, l_srttb;
1719 s64 srttb_diff;
1720 s64 sqr_diff;
1721 s64 var;
1722
1723 preempt_disable();
1724 st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1725 r_srttb = st->stats[GFS2_LKS_SRTTB];
1726 r_dcount = st->stats[GFS2_LKS_DCOUNT];
1727 var = st->stats[GFS2_LKS_SRTTVARB] +
1728 gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1729 preempt_enable();
1730
1731 l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1732 l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1733
1734 if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))
1735 return false;
1736
1737 srttb_diff = r_srttb - l_srttb;
1738 sqr_diff = srttb_diff * srttb_diff;
1739
1740 var *= 2;
1741 if (l_dcount < 8 || r_dcount < 8)
1742 var *= 2;
1743 if (loops == 1)
1744 var *= 2;
1745
1746 return ((srttb_diff < 0) && (sqr_diff > var));
1747}
1748
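
The decision rule documented above is a variance-gated significance test: the rgrp is rejected only when its own smoothed round-trip time exceeds the per-CPU average for rgrp glocks and the squared difference outgrows a margin built from the summed variances (doubled, and doubled again for sparse data or a second pass). A minimal standalone sketch of the same rule, with made-up sample values:

/* Minimal sketch of the decision rule in gfs2_rgrp_congested();
 * the inputs are made-up sample values, not real glock statistics. */
#include <stdbool.h>
#include <stdio.h>

static bool congested(long long l_srttb, long long l_var, long long l_dcount,
		      long long r_srttb, long long r_var, long long r_dcount,
		      int loops)
{
	long long diff, sqr_diff, var;

	if (l_dcount < 1 || r_dcount < 1 || r_srttb == 0)
		return false;               /* not enough data to judge */

	diff = r_srttb - l_srttb;           /* negative: rgrp slower than avg */
	sqr_diff = diff * diff;

	var = (l_var + r_var) * 2;
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;                   /* widen margin on sparse data */
	if (loops == 1)
		var *= 2;                   /* be less picky on a retry */

	return diff < 0 && sqr_diff > var;
}

int main(void)
{
	printf("%d\n", congested(900, 50, 20, 300, 40, 20, 0)); /* 1: congested */
	printf("%d\n", congested(310, 50, 20, 300, 40, 20, 0)); /* 0: just noise */
	return 0;
}
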
1749/**
1750 * gfs2_rgrp_used_recently
1751 * @rs: The block reservation with the rgrp to test
1752 * @msecs: The time limit in milliseconds
1753 *
1754 * Returns: True if the rgrp glock has been used within the time limit
1755 */
1756static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1757 u64 msecs)
1758{
1759 u64 tdiff;
1760
1761 tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1762 rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1763
1764 return tdiff > (msecs * 1000 * 1000);
1765}
1766
1767static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1768{
1769 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1770 u32 skip;
1771
1772 get_random_bytes(&skip, sizeof(skip));
1773 return skip % sdp->sd_rgrps;
1774}
1775
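
gfs2_orlov_skip() supplies the random element of the Orlov-style spreading enabled by GFS2_AF_ORLOV: a new top-level directory starts its rgrp search a random distance in, rather than next to its parent. A rough userspace illustration, where rand() stands in for get_random_bytes() and the rgrp count is an assumed value:

/* Rough userspace illustration of gfs2_orlov_skip(). */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
	unsigned sd_rgrps = 128;   /* assumed resource group count */
	unsigned skip;

	srand((unsigned)time(NULL));
	skip = (unsigned)rand() % sd_rgrps;

	/* gfs2_inplace_reserve() then passes over 'skip' unlocked rgrps
	 * before reserving, spreading new directories across the fs. */
	printf("skip %u rgrps\n", skip);
	return 0;
}
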
1672static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) 1776static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1673{ 1777{
1674 struct gfs2_rgrpd *rgd = *pos; 1778 struct gfs2_rgrpd *rgd = *pos;
1779 struct gfs2_sbd *sdp = rgd->rd_sbd;
1675 1780
1676 rgd = gfs2_rgrpd_get_next(rgd); 1781 rgd = gfs2_rgrpd_get_next(rgd);
1677 if (rgd == NULL) 1782 if (rgd == NULL)
1678 rgd = gfs2_rgrpd_get_next(NULL); 1783 rgd = gfs2_rgrpd_get_first(sdp);
1679 *pos = rgd; 1784 *pos = rgd;
1680 if (rgd != begin) /* If we didn't wrap */ 1785 if (rgd != begin) /* If we didn't wrap */
1681 return true; 1786 return true;
@@ -1690,14 +1795,15 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
1690 * Returns: errno 1795 * Returns: errno
1691 */ 1796 */
1692 1797
1693int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) 1798int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
1694{ 1799{
1695 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1800 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1696 struct gfs2_rgrpd *begin = NULL; 1801 struct gfs2_rgrpd *begin = NULL;
1697 struct gfs2_blkreserv *rs = ip->i_res; 1802 struct gfs2_blkreserv *rs = ip->i_res;
1698 int error = 0, rg_locked, flags = LM_FLAG_TRY; 1803 int error = 0, rg_locked, flags = 0;
1699 u64 last_unlinked = NO_BLOCK; 1804 u64 last_unlinked = NO_BLOCK;
1700 int loops = 0; 1805 int loops = 0;
1806 u32 skip = 0;
1701 1807
1702 if (sdp->sd_args.ar_rgrplvb) 1808 if (sdp->sd_args.ar_rgrplvb)
1703 flags |= GL_SKIP; 1809 flags |= GL_SKIP;
@@ -1711,6 +1817,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1711 } else { 1817 } else {
1712 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1818 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1713 } 1819 }
1820 if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV))
1821 skip = gfs2_orlov_skip(ip);
1714 if (rs->rs_rbm.rgd == NULL) 1822 if (rs->rs_rbm.rgd == NULL)
1715 return -EBADSLT; 1823 return -EBADSLT;
1716 1824
@@ -1719,13 +1827,20 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1719 1827
1720 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { 1828 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
1721 rg_locked = 0; 1829 rg_locked = 0;
1830 if (skip && skip--)
1831 goto next_rgrp;
1832 if (!gfs2_rs_active(rs) && (loops < 2) &&
1833 gfs2_rgrp_used_recently(rs, 1000) &&
1834 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1835 goto next_rgrp;
1722 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, 1836 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
1723 LM_ST_EXCLUSIVE, flags, 1837 LM_ST_EXCLUSIVE, flags,
1724 &rs->rs_rgd_gh); 1838 &rs->rs_rgd_gh);
1725 if (error == GLR_TRYFAILED)
1726 goto next_rgrp;
1727 if (unlikely(error)) 1839 if (unlikely(error))
1728 return error; 1840 return error;
1841 if (!gfs2_rs_active(rs) && (loops < 2) &&
1842 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1843 goto skip_rgrp;
1729 if (sdp->sd_args.ar_rgrplvb) { 1844 if (sdp->sd_args.ar_rgrplvb) {
1730 error = update_rgrp_lvb(rs->rs_rbm.rgd); 1845 error = update_rgrp_lvb(rs->rs_rbm.rgd);
1731 if (unlikely(error)) { 1846 if (unlikely(error)) {
@@ -1772,12 +1887,13 @@ next_rgrp:
1772 /* Find the next rgrp, and continue looking */ 1887 /* Find the next rgrp, and continue looking */
1773 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) 1888 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
1774 continue; 1889 continue;
1890 if (skip)
1891 continue;
1775 1892
1776 /* If we've scanned all the rgrps, but found no free blocks 1893 /* If we've scanned all the rgrps, but found no free blocks
1777 * then this checks for some less likely conditions before 1894 * then this checks for some less likely conditions before
1778 * trying again. 1895 * trying again.
1779 */ 1896 */
1780 flags &= ~LM_FLAG_TRY;
1781 loops++; 1897 loops++;
1782 /* Check that fs hasn't grown if writing to rindex */ 1898 /* Check that fs hasn't grown if writing to rindex */
1783 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { 1899 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 24077958dcf6..842185853f6b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,7 +39,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
39 39
40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
41 41
42extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); 42#define GFS2_AF_ORLOV 1
43extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags);
43extern void gfs2_inplace_release(struct gfs2_inode *ip); 44extern void gfs2_inplace_release(struct gfs2_inode *ip);
44 45
45extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, 46extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bc737261f234..d6488674d916 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
810 return; 810 return;
811 } 811 }
812 need_unlock = 1; 812 need_unlock = 1;
813 } 813 } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
814 return;
814 815
815 if (current->journal_info == NULL) { 816 if (current->journal_info == NULL) {
816 ret = gfs2_trans_begin(sdp, RES_DINODE, 0); 817 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index bbdc78af60ca..2ee13e841e9f 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -486,7 +486,7 @@ TRACE_EVENT(gfs2_block_alloc,
486 ), 486 ),
487 487
488 TP_fast_assign( 488 TP_fast_assign(
489 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; 489 __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
490 __entry->start = block; 490 __entry->start = block;
491 __entry->inum = ip->i_no_addr; 491 __entry->inum = ip->i_no_addr;
492 __entry->len = len; 492 __entry->len = len;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index adbd27875ef9..413627072f36 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
155 struct gfs2_sbd *sdp = gl->gl_sbd; 155 struct gfs2_sbd *sdp = gl->gl_sbd;
156 struct gfs2_bufdata *bd; 156 struct gfs2_bufdata *bd;
157 157
158 lock_buffer(bh);
159 gfs2_log_lock(sdp);
158 bd = bh->b_private; 160 bd = bh->b_private;
159 if (bd) 161 if (bd)
160 gfs2_assert(sdp, bd->bd_gl == gl); 162 gfs2_assert(sdp, bd->bd_gl == gl);
161 else { 163 else {
164 gfs2_log_unlock(sdp);
165 unlock_buffer(bh);
162 gfs2_attach_bufdata(gl, bh, meta); 166 gfs2_attach_bufdata(gl, bh, meta);
163 bd = bh->b_private; 167 bd = bh->b_private;
168 lock_buffer(bh);
169 gfs2_log_lock(sdp);
164 } 170 }
165 lops_add(sdp, bd); 171 lops_add(sdp, bd);
172 gfs2_log_unlock(sdp);
173 unlock_buffer(bh);
166} 174}
167 175
168void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) 176void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index db330e5518cd..76c144b3c9bb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 if (error) 734 if (error)
735 return error; 735 return error;
736 736
737 error = gfs2_inplace_reserve(ip, blks); 737 error = gfs2_inplace_reserve(ip, blks, 0);
738 if (error) 738 if (error)
739 goto out_gunlock_q; 739 goto out_gunlock_q;
740 740
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 78f21f8dc2ec..43b315f2002b 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
710 struct vfsmount *proc_mnt; 710 struct vfsmount *proc_mnt;
711 int err = -ENOENT; 711 int err = -ENOENT;
712 712
713 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); 713 proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
714 if (IS_ERR(proc_mnt)) 714 if (IS_ERR(proc_mnt))
715 goto out; 715 goto out;
716 716
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5bc355d8243..78bde32ea951 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * hugetlbpage-backed filesystem. Based on ramfs. 2 * hugetlbpage-backed filesystem. Based on ramfs.
3 * 3 *
4 * William Irwin, 2002 4 * Nadia Yvette Chambers, 2002
5 * 5 *
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 */ 7 */
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
151{ 151{
152 struct mm_struct *mm = current->mm; 152 struct mm_struct *mm = current->mm;
153 struct vm_area_struct *vma; 153 struct vm_area_struct *vma;
154 unsigned long start_addr;
155 struct hstate *h = hstate_file(file); 154 struct hstate *h = hstate_file(file);
155 struct vm_unmapped_area_info info;
156 156
157 if (len & ~huge_page_mask(h)) 157 if (len & ~huge_page_mask(h))
158 return -EINVAL; 158 return -EINVAL;
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
173 return addr; 173 return addr;
174 } 174 }
175 175
176 if (len > mm->cached_hole_size) 176 info.flags = 0;
177 start_addr = mm->free_area_cache; 177 info.length = len;
178 else { 178 info.low_limit = TASK_UNMAPPED_BASE;
179 start_addr = TASK_UNMAPPED_BASE; 179 info.high_limit = TASK_SIZE;
180 mm->cached_hole_size = 0; 180 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
181 } 181 info.align_offset = 0;
182 182 return vm_unmapped_area(&info);
183full_search:
184 addr = ALIGN(start_addr, huge_page_size(h));
185
186 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
187 /* At this point: (!vma || addr < vma->vm_end). */
188 if (TASK_SIZE - len < addr) {
189 /*
190 * Start a new search - just in case we missed
191 * some holes.
192 */
193 if (start_addr != TASK_UNMAPPED_BASE) {
194 start_addr = TASK_UNMAPPED_BASE;
195 mm->cached_hole_size = 0;
196 goto full_search;
197 }
198 return -ENOMEM;
199 }
200
201 if (!vma || addr + len <= vma->vm_start) {
202 mm->free_area_cache = addr + len;
203 return addr;
204 }
205 if (addr + mm->cached_hole_size < vma->vm_start)
206 mm->cached_hole_size = vma->vm_start - addr;
207 addr = ALIGN(vma->vm_end, huge_page_size(h));
208 }
209} 183}
210#endif 184#endif
211 185
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
608 int rc; 582 int rc;
609 583
610 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 584 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
611 if (rc) 585 if (rc != MIGRATEPAGE_SUCCESS)
612 return rc; 586 return rc;
613 migrate_page_copy(newpage, page); 587 migrate_page_copy(newpage, page);
614 588
615 return 0; 589 return MIGRATEPAGE_SUCCESS;
616} 590}
617 591
618static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 592static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
923 .kill_sb = kill_litter_super, 897 .kill_sb = kill_litter_super,
924}; 898};
925 899
926static struct vfsmount *hugetlbfs_vfsmount; 900static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
927 901
928static int can_do_hugetlb_shm(void) 902static int can_do_hugetlb_shm(void)
929{ 903{
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 906 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
933} 907}
934 908
909static int get_hstate_idx(int page_size_log)
910{
911 struct hstate *h;
912
913 if (!page_size_log)
914 return default_hstate_idx;
915 h = size_to_hstate(1 << page_size_log);
916 if (!h)
917 return -1;
918 return h - hstates;
919}
920
935struct file *hugetlb_file_setup(const char *name, unsigned long addr, 921struct file *hugetlb_file_setup(const char *name, unsigned long addr,
936 size_t size, vm_flags_t acctflag, 922 size_t size, vm_flags_t acctflag,
937 struct user_struct **user, int creat_flags) 923 struct user_struct **user,
924 int creat_flags, int page_size_log)
938{ 925{
939 int error = -ENOMEM; 926 int error = -ENOMEM;
940 struct file *file; 927 struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
944 struct qstr quick_string; 931 struct qstr quick_string;
945 struct hstate *hstate; 932 struct hstate *hstate;
946 unsigned long num_pages; 933 unsigned long num_pages;
934 int hstate_idx;
935
936 hstate_idx = get_hstate_idx(page_size_log);
937 if (hstate_idx < 0)
938 return ERR_PTR(-ENODEV);
947 939
948 *user = NULL; 940 *user = NULL;
949 if (!hugetlbfs_vfsmount) 941 if (!hugetlbfs_vfsmount[hstate_idx])
950 return ERR_PTR(-ENOENT); 942 return ERR_PTR(-ENOENT);
951 943
952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 944 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
963 } 955 }
964 } 956 }
965 957
966 root = hugetlbfs_vfsmount->mnt_root; 958 root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
967 quick_string.name = name; 959 quick_string.name = name;
968 quick_string.len = strlen(quick_string.name); 960 quick_string.len = strlen(quick_string.name);
969 quick_string.hash = 0; 961 quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
971 if (!path.dentry) 963 if (!path.dentry)
972 goto out_shm_unlock; 964 goto out_shm_unlock;
973 965
974 path.mnt = mntget(hugetlbfs_vfsmount); 966 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
975 error = -ENOSPC; 967 error = -ENOSPC;
976 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); 968 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
977 if (!inode) 969 if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
1011 1003
1012static int __init init_hugetlbfs_fs(void) 1004static int __init init_hugetlbfs_fs(void)
1013{ 1005{
1006 struct hstate *h;
1014 int error; 1007 int error;
1015 struct vfsmount *vfsmount; 1008 int i;
1016 1009
1017 error = bdi_init(&hugetlbfs_backing_dev_info); 1010 error = bdi_init(&hugetlbfs_backing_dev_info);
1018 if (error) 1011 if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
1029 if (error) 1022 if (error)
1030 goto out; 1023 goto out;
1031 1024
1032 vfsmount = kern_mount(&hugetlbfs_fs_type); 1025 i = 0;
1026 for_each_hstate(h) {
1027 char buf[50];
1028 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1033 1029
1034 if (!IS_ERR(vfsmount)) { 1030 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1035 hugetlbfs_vfsmount = vfsmount; 1031 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1036 return 0; 1032 buf);
1037 }
1038 1033
1039 error = PTR_ERR(vfsmount); 1034 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1035 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1036 "page size %uK", ps_kb);
1037 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1038 hugetlbfs_vfsmount[i] = NULL;
1039 }
1040 i++;
1041 }
1042 /* Non default hstates are optional */
1043 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1044 return 0;
1040 1045
1041 out: 1046 out:
1042 kmem_cache_destroy(hugetlbfs_inode_cachep); 1047 kmem_cache_destroy(hugetlbfs_inode_cachep);
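
Two small pieces of arithmetic drive the per-size support: hugetlb_file_setup() maps a caller-supplied log2 page size to an hstate (zero keeps the default), and init_hugetlbfs_fs() derives each mount's "pagesize=%uK" data from the hstate order. A standalone sketch of both, where PAGE_SHIFT, the orders, and the requested log2 size are assumptions for a common x86-64 configuration:

/* Sketch of the size arithmetic behind the per-hstate mounts;
 * PAGE_SHIFT, the orders and page_size_log are x86-64 assumptions. */
#include <stdio.h>

#define PAGE_SHIFT 12                      /* 4 KiB base pages */

int main(void)
{
	unsigned orders[] = { 9, 18 };     /* 2 MiB and 1 GiB hstates */
	int page_size_log = 21;            /* caller asks for 2 MiB pages */
	unsigned i;

	/* hugetlb_file_setup(): log2 size -> byte size looked up via
	 * size_to_hstate(); an unknown size yields -ENODEV. */
	printf("requested size: %lu bytes\n", 1UL << page_size_log);

	/* init_hugetlbfs_fs(): hstate order -> mount data string */
	for (i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		unsigned ps_kb = 1U << (orders[i] + PAGE_SHIFT - 10);
		printf("pagesize=%uK\n", ps_kb);  /* 2048K, 1048576K */
	}
	return 0;
}
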
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void)
1047 1052
1048static void __exit exit_hugetlbfs_fs(void) 1053static void __exit exit_hugetlbfs_fs(void)
1049{ 1054{
1055 struct hstate *h;
1056 int i;
1057
1058
1050 /* 1059 /*
1051 * Make sure all delayed rcu free inodes are flushed before we 1060 * Make sure all delayed rcu free inodes are flushed before we
1052 * destroy cache. 1061 * destroy cache.
1053 */ 1062 */
1054 rcu_barrier(); 1063 rcu_barrier();
1055 kmem_cache_destroy(hugetlbfs_inode_cachep); 1064 kmem_cache_destroy(hugetlbfs_inode_cachep);
1056 kern_unmount(hugetlbfs_vfsmount); 1065 i = 0;
1066 for_each_hstate(h)
1067 kern_unmount(hugetlbfs_vfsmount[i++]);
1057 unregister_filesystem(&hugetlbfs_fs_type); 1068 unregister_filesystem(&hugetlbfs_fs_type);
1058 bdi_destroy(&hugetlbfs_backing_dev_info); 1069 bdi_destroy(&hugetlbfs_backing_dev_info);
1059} 1070}
diff --git a/fs/inode.c b/fs/inode.c
index b03c71957246..14084b72b259 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->host = inode; 165 mapping->host = inode;
166 mapping->flags = 0; 166 mapping->flags = 0;
167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
168 mapping->assoc_mapping = NULL; 168 mapping->private_data = NULL;
169 mapping->backing_dev_info = &default_backing_dev_info; 169 mapping->backing_dev_info = &default_backing_dev_info;
170 mapping->writeback_index = 0; 170 mapping->writeback_index = 0;
171 171
@@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode)
408 spin_unlock(&inode->i_sb->s_inode_lru_lock); 408 spin_unlock(&inode->i_sb->s_inode_lru_lock);
409} 409}
410 410
411/*
412 * Add inode to LRU if needed (inode is unused and clean).
413 *
414 * Needs inode->i_lock held.
415 */
416void inode_add_lru(struct inode *inode)
417{
418 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
419 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
420 inode_lru_list_add(inode);
421}
422
423
411static void inode_lru_list_del(struct inode *inode) 424static void inode_lru_list_del(struct inode *inode)
412{ 425{
413 spin_lock(&inode->i_sb->s_inode_lru_lock); 426 spin_lock(&inode->i_sb->s_inode_lru_lock);
@@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode)
1390 1403
1391 if (!drop && (sb->s_flags & MS_ACTIVE)) { 1404 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1392 inode->i_state |= I_REFERENCED; 1405 inode->i_state |= I_REFERENCED;
1393 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1406 inode_add_lru(inode);
1394 inode_lru_list_add(inode);
1395 spin_unlock(&inode->i_lock); 1407 spin_unlock(&inode->i_lock);
1396 return; 1408 return;
1397 } 1409 }
diff --git a/fs/internal.h b/fs/internal.h
index 916b7cbf3e3e..2f6af7f645eb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f);
110 * inode.c 110 * inode.c
111 */ 111 */
112extern spinlock_t inode_sb_list_lock; 112extern spinlock_t inode_sb_list_lock;
113extern void inode_add_lru(struct inode *inode);
113 114
114/* 115/*
115 * fs-writeback.c 116 * fs-writeback.c
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 78b7f84241d4..071d6905f0dd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1259,7 +1259,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1259 goto not_jbd; 1259 goto not_jbd;
1260 } 1260 }
1261 1261
1262 /* keep track of wether or not this transaction modified us */ 1262 /* keep track of whether or not this transaction modified us */
1263 was_modified = jh->b_modified; 1263 was_modified = jh->b_modified;
1264 1264
1265 /* 1265 /*
@@ -1961,7 +1961,9 @@ retry:
1961 spin_unlock(&journal->j_list_lock); 1961 spin_unlock(&journal->j_list_lock);
1962 jbd_unlock_bh_state(bh); 1962 jbd_unlock_bh_state(bh);
1963 spin_unlock(&journal->j_state_lock); 1963 spin_unlock(&journal->j_state_lock);
1964 unlock_buffer(bh);
1964 log_wait_commit(journal, tid); 1965 log_wait_commit(journal, tid);
1966 lock_buffer(bh);
1965 goto retry; 1967 goto retry;
1966 } 1968 }
1967 /* 1969 /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 484b8d1c6cb6..dbf41f9452db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access);
60EXPORT_SYMBOL(jbd2_journal_get_undo_access); 60EXPORT_SYMBOL(jbd2_journal_get_undo_access);
61EXPORT_SYMBOL(jbd2_journal_set_triggers); 61EXPORT_SYMBOL(jbd2_journal_set_triggers);
62EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 62EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
63EXPORT_SYMBOL(jbd2_journal_release_buffer);
64EXPORT_SYMBOL(jbd2_journal_forget); 63EXPORT_SYMBOL(jbd2_journal_forget);
65#if 0 64#if 0
66EXPORT_SYMBOL(journal_sync_buffer); 65EXPORT_SYMBOL(journal_sync_buffer);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a74ba4659549..42f6615af0ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1207,17 +1207,6 @@ out:
1207 return ret; 1207 return ret;
1208} 1208}
1209 1209
1210/*
1211 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1212 * updates, if the update decided in the end that it didn't need access.
1213 *
1214 */
1215void
1216jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1217{
1218 BUFFER_TRACE(bh, "entry");
1219}
1220
1221/** 1210/**
1222 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 1211 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1223 * @handle: transaction handle 1212 * @handle: transaction handle
@@ -1261,7 +1250,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1261 goto not_jbd; 1250 goto not_jbd;
1262 } 1251 }
1263 1252
1264 /* keep track of wether or not this transaction modified us */ 1253 /* keep track of whether or not this transaction modified us */
1265 was_modified = jh->b_modified; 1254 was_modified = jh->b_modified;
1266 1255
1267 /* 1256 /*
diff --git a/fs/libfs.c b/fs/libfs.c
index 7cc37ca19cd8..35fc6e74cd88 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -81,11 +81,11 @@ int dcache_dir_close(struct inode *inode, struct file *file)
81 return 0; 81 return 0;
82} 82}
83 83
84loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 84loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
85{ 85{
86 struct dentry *dentry = file->f_path.dentry; 86 struct dentry *dentry = file->f_path.dentry;
87 mutex_lock(&dentry->d_inode->i_mutex); 87 mutex_lock(&dentry->d_inode->i_mutex);
88 switch (origin) { 88 switch (whence) {
89 case 1: 89 case 1:
90 offset += file->f_pos; 90 offset += file->f_pos;
91 case 0: 91 case 0:
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 13ad1539fbf2..00ec0b9c94d1 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -64,10 +64,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock,
64{ 64{
65 const struct file_lock *fl = &lock->fl; 65 const struct file_lock *fl = &lock->fl;
66 66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start); 67 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX) 68 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0; 69 *l_len = 0;
@@ -122,7 +118,6 @@ static void encode_netobj(struct xdr_stream *xdr,
122{ 118{
123 __be32 *p; 119 __be32 *p;
124 120
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length); 121 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length); 122 xdr_encode_opaque(p, data, length);
128} 123}
@@ -156,7 +151,6 @@ out_overflow:
156static void encode_cookie(struct xdr_stream *xdr, 151static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie) 152 const struct nlm_cookie *cookie)
158{ 153{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); 154 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161} 155}
162 156
@@ -198,7 +192,6 @@ out_overflow:
198 */ 192 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) 193static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{ 194{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size); 195 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203} 196}
204 197
@@ -336,7 +329,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name)
336 u32 length = strlen(name); 329 u32 length = strlen(name);
337 __be32 *p; 330 __be32 *p;
338 331
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length); 332 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length); 333 xdr_encode_opaque(p, name, length);
342} 334}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 05d29124c6ab..54f9e6ce0430 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -141,7 +141,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
141 141
142static void nlmclnt_release_lockargs(struct nlm_rqst *req) 142static void nlmclnt_release_lockargs(struct nlm_rqst *req)
143{ 143{
144 BUG_ON(req->a_args.lock.fl.fl_ops != NULL); 144 WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL);
145} 145}
146 146
147/** 147/**
@@ -465,7 +465,6 @@ static const struct file_lock_operations nlmclnt_lock_ops = {
465 465
466static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) 466static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
467{ 467{
468 BUG_ON(fl->fl_ops != NULL);
469 fl->fl_u.nfs_fl.state = 0; 468 fl->fl_u.nfs_fl.state = 0;
470 fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); 469 fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
471 INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); 470 INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 982d2676e1f8..9a55797a1cd4 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -60,10 +60,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock,
60{ 60{
61 const struct file_lock *fl = &lock->fl; 61 const struct file_lock *fl = &lock->fl;
62 62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start); 63 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX) 64 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0; 65 *l_len = 0;
@@ -119,7 +115,6 @@ static void encode_netobj(struct xdr_stream *xdr,
119{ 115{
120 __be32 *p; 116 __be32 *p;
121 117
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length); 118 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length); 119 xdr_encode_opaque(p, data, length);
125} 120}
@@ -153,7 +148,6 @@ out_overflow:
153static void encode_cookie(struct xdr_stream *xdr, 148static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie) 149 const struct nlm_cookie *cookie)
155{ 150{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); 151 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158} 152}
159 153
@@ -195,7 +189,6 @@ out_overflow:
195 */ 189 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) 190static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{ 191{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); 192 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200} 193}
201 194
@@ -330,7 +323,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name)
330 u32 length = strlen(name); 323 u32 length = strlen(name);
331 __be32 *p; 324 __be32 *p;
332 325
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length); 326 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length); 327 xdr_encode_opaque(p, name, length);
336} 328}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f9b22e58f78f..0e17090c310f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -177,9 +177,6 @@ static void nlm_destroy_host_locked(struct nlm_host *host)
177 177
178 dprintk("lockd: destroy host %s\n", host->h_name); 178 dprintk("lockd: destroy host %s\n", host->h_name);
179 179
180 BUG_ON(!list_empty(&host->h_lockowners));
181 BUG_ON(atomic_read(&host->h_count));
182
183 hlist_del_init(&host->h_hash); 180 hlist_del_init(&host->h_hash);
184 181
185 nsm_unmonitor(host); 182 nsm_unmonitor(host);
@@ -289,13 +286,12 @@ void nlmclnt_release_host(struct nlm_host *host)
289 286
290 dprintk("lockd: release client host %s\n", host->h_name); 287 dprintk("lockd: release client host %s\n", host->h_name);
291 288
292 BUG_ON(atomic_read(&host->h_count) < 0); 289 WARN_ON_ONCE(host->h_server);
293 BUG_ON(host->h_server);
294 290
295 if (atomic_dec_and_test(&host->h_count)) { 291 if (atomic_dec_and_test(&host->h_count)) {
296 BUG_ON(!list_empty(&host->h_lockowners)); 292 WARN_ON_ONCE(!list_empty(&host->h_lockowners));
297 BUG_ON(!list_empty(&host->h_granted)); 293 WARN_ON_ONCE(!list_empty(&host->h_granted));
298 BUG_ON(!list_empty(&host->h_reclaim)); 294 WARN_ON_ONCE(!list_empty(&host->h_reclaim));
299 295
300 mutex_lock(&nlm_host_mutex); 296 mutex_lock(&nlm_host_mutex);
301 nlm_destroy_host_locked(host); 297 nlm_destroy_host_locked(host);
@@ -412,8 +408,7 @@ void nlmsvc_release_host(struct nlm_host *host)
412 408
413 dprintk("lockd: release server host %s\n", host->h_name); 409 dprintk("lockd: release server host %s\n", host->h_name);
414 410
415 BUG_ON(atomic_read(&host->h_count) < 0); 411 WARN_ON_ONCE(!host->h_server);
416 BUG_ON(!host->h_server);
417 atomic_dec(&host->h_count); 412 atomic_dec(&host->h_count);
418} 413}
419 414
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3d7e09bcc0e9..3c2cfc683631 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -154,8 +154,6 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
154 .rpc_resp = res, 154 .rpc_resp = res,
155 }; 155 };
156 156
157 BUG_ON(clnt == NULL);
158
159 memset(res, 0, sizeof(*res)); 157 memset(res, 0, sizeof(*res));
160 158
161 msg.rpc_proc = &clnt->cl_procinfo[proc]; 159 msg.rpc_proc = &clnt->cl_procinfo[proc];
@@ -466,7 +464,6 @@ static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
466 const u32 len = strlen(string); 464 const u32 len = strlen(string);
467 __be32 *p; 465 __be32 *p;
468 466
469 BUG_ON(len > SM_MAXSTRLEN);
470 p = xdr_reserve_space(xdr, 4 + len); 467 p = xdr_reserve_space(xdr, 4 + len);
471 xdr_encode_opaque(p, string, len); 468 xdr_encode_opaque(p, string, len);
472} 469}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index adb90116d36b..af49e2d6941a 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -33,7 +33,7 @@
33 * are being written out - and waiting for GC to make progress, naturally. 33 * are being written out - and waiting for GC to make progress, naturally.
34 * 34 *
35 * So we cannot just call iget() or some variant of it, but first have to check 35 * So we cannot just call iget() or some variant of it, but first have to check
36 * wether the inode in question might be in I_FREEING state. Therefore we 36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against 37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long. 38 * that list first. Normally this should be at most 1-2 entries long.
39 * 39 *
diff --git a/fs/mount.h b/fs/mount.h
index 4f291f9de641..cd5007980400 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,8 +4,11 @@
4 4
5struct mnt_namespace { 5struct mnt_namespace {
6 atomic_t count; 6 atomic_t count;
7 unsigned int proc_inum;
7 struct mount * root; 8 struct mount * root;
8 struct list_head list; 9 struct list_head list;
10 struct user_namespace *user_ns;
11 u64 seq; /* Sequence number to prevent loops */
9 wait_queue_head_t poll; 12 wait_queue_head_t poll;
10 int event; 13 int event;
11}; 14};
diff --git a/fs/namei.c b/fs/namei.c
index 937f9d50c84b..5f4cdf3ad913 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2131 if (!len) 2131 if (!len)
2132 return ERR_PTR(-EACCES); 2132 return ERR_PTR(-EACCES);
2133 2133
2134 if (unlikely(name[0] == '.')) {
2135 if (len < 2 || (len == 2 && name[1] == '.'))
2136 return ERR_PTR(-EACCES);
2137 }
2138
2134 while (len--) { 2139 while (len--) {
2135 c = *(const unsigned char *)name++; 2140 c = *(const unsigned char *)name++;
2136 if (c == '/' || c == '\0') 2141 if (c == '/' || c == '\0')
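
The added check rejects "." and ".." up front, since lookup_one_len() bypasses the normal path walk that would otherwise handle them; names that merely begin with a dot are still allowed. A standalone reimplementation of the predicate, for illustration only:

/* Standalone reimplementation of the new dot-name check. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool rejected(const char *name)
{
	size_t len = strlen(name);

	/* Only "." (len 1) and ".." (len 2) match; ".foo" does not. */
	return name[0] == '.' && (len < 2 || (len == 2 && name[1] == '.'));
}

int main(void)
{
	printf("%d %d %d\n", rejected("."), rejected(".."), rejected(".foo"));
	/* prints: 1 1 0 */
	return 0;
}
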
diff --git a/fs/namespace.c b/fs/namespace.c
index 24960626bb6b..398a50ff2438 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -12,6 +12,7 @@
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/mnt_namespace.h> 14#include <linux/mnt_namespace.h>
15#include <linux/user_namespace.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/security.h> 17#include <linux/security.h>
17#include <linux/idr.h> 18#include <linux/idr.h>
@@ -20,6 +21,7 @@
20#include <linux/fs_struct.h> /* get_fs_root et.al. */ 21#include <linux/fs_struct.h> /* get_fs_root et.al. */
21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/proc_fs.h>
23#include "pnode.h" 25#include "pnode.h"
24#include "internal.h" 26#include "internal.h"
25 27
@@ -784,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
784 if (!mnt) 786 if (!mnt)
785 return ERR_PTR(-ENOMEM); 787 return ERR_PTR(-ENOMEM);
786 788
787 if (flag & (CL_SLAVE | CL_PRIVATE)) 789 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
788 mnt->mnt_group_id = 0; /* not a peer of original */ 790 mnt->mnt_group_id = 0; /* not a peer of original */
789 else 791 else
790 mnt->mnt_group_id = old->mnt_group_id; 792 mnt->mnt_group_id = old->mnt_group_id;
@@ -805,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
805 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 807 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
806 br_write_unlock(&vfsmount_lock); 808 br_write_unlock(&vfsmount_lock);
807 809
808 if (flag & CL_SLAVE) { 810 if ((flag & CL_SLAVE) ||
811 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
809 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 812 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
810 mnt->mnt_master = old; 813 mnt->mnt_master = old;
811 CLEAR_MNT_SHARED(mnt); 814 CLEAR_MNT_SHARED(mnt);
@@ -1266,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1266 goto dput_and_out; 1269 goto dput_and_out;
1267 1270
1268 retval = -EPERM; 1271 retval = -EPERM;
1269 if (!capable(CAP_SYS_ADMIN)) 1272 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1270 goto dput_and_out; 1273 goto dput_and_out;
1271 1274
1272 retval = do_umount(mnt, flags); 1275 retval = do_umount(mnt, flags);
@@ -1292,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
1292 1295
1293static int mount_is_safe(struct path *path) 1296static int mount_is_safe(struct path *path)
1294{ 1297{
1295 if (capable(CAP_SYS_ADMIN)) 1298 if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1296 return 0; 1299 return 0;
1297 return -EPERM; 1300 return -EPERM;
1298#ifdef notyet 1301#ifdef notyet
@@ -1308,6 +1311,26 @@ static int mount_is_safe(struct path *path)
1308#endif 1311#endif
1309} 1312}
1310 1313
1314static bool mnt_ns_loop(struct path *path)
1315{
1316 /* Could bind mounting the mount namespace inode cause a
1317 * mount namespace loop?
1318 */
1319 struct inode *inode = path->dentry->d_inode;
1320 struct proc_inode *ei;
1321 struct mnt_namespace *mnt_ns;
1322
1323 if (!proc_ns_inode(inode))
1324 return false;
1325
1326 ei = PROC_I(inode);
1327 if (ei->ns_ops != &mntns_operations)
1328 return false;
1329
1330 mnt_ns = ei->ns;
1331 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1332}
1333
1311struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1334struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1312 int flag) 1335 int flag)
1313{ 1336{
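
mnt_ns_loop() enforces a simple ordering rule: a bind mount of a /proc/*/ns/mnt reference is refused (-EINVAL, see the do_loopback() hunk below) whenever the current namespace's sequence number is greater than or equal to that of the namespace being bound. A namespace can therefore only ever hold references to strictly newer namespaces, and reference-counting cycles cannot form. A tiny model of the rule (user-space sketch, names hypothetical):

#include <stdio.h>
#include <stdint.h>

/* Model of the mnt_ns_loop() ordering rule: references may only
 * point at strictly newer namespaces, which keeps the "holds a
 * reference to" graph acyclic. */
struct mnt_ns_model { uint64_t seq; };

static int bind_would_loop(const struct mnt_ns_model *current_ns,
			   const struct mnt_ns_model *target_ns)
{
	return current_ns->seq >= target_ns->seq;	/* -EINVAL case */
}

int main(void)
{
	struct mnt_ns_model older = { .seq = 5 }, newer = { .seq = 9 };

	printf("bind newer ns into older: %s\n",
	       bind_would_loop(&older, &newer) ? "loop" : "ok");
	printf("bind older ns into newer: %s\n",
	       bind_would_loop(&newer, &older) ? "loop" : "ok");
	return 0;
}
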
@@ -1610,7 +1633,7 @@ static int do_change_type(struct path *path, int flag)
1610 int type; 1633 int type;
1611 int err = 0; 1634 int err = 0;
1612 1635
1613 if (!capable(CAP_SYS_ADMIN)) 1636 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1614 return -EPERM; 1637 return -EPERM;
1615 1638
1616 if (path->dentry != path->mnt->mnt_root) 1639 if (path->dentry != path->mnt->mnt_root)
@@ -1655,6 +1678,10 @@ static int do_loopback(struct path *path, const char *old_name,
1655 if (err) 1678 if (err)
1656 return err; 1679 return err;
1657 1680
1681 err = -EINVAL;
1682 if (mnt_ns_loop(&old_path))
1683 goto out;
1684
1658 err = lock_mount(path); 1685 err = lock_mount(path);
1659 if (err) 1686 if (err)
1660 goto out; 1687 goto out;
@@ -1770,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name)
1770 struct mount *p; 1797 struct mount *p;
1771 struct mount *old; 1798 struct mount *old;
1772 int err = 0; 1799 int err = 0;
1773 if (!capable(CAP_SYS_ADMIN)) 1800 if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1774 return -EPERM; 1801 return -EPERM;
1775 if (!old_name || !*old_name) 1802 if (!old_name || !*old_name)
1776 return -EINVAL; 1803 return -EINVAL;
@@ -1857,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1857 return ERR_PTR(err); 1884 return ERR_PTR(err);
1858} 1885}
1859 1886
1860static struct vfsmount *
1861do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1862{
1863 struct file_system_type *type = get_fs_type(fstype);
1864 struct vfsmount *mnt;
1865 if (!type)
1866 return ERR_PTR(-ENODEV);
1867 mnt = vfs_kern_mount(type, flags, name, data);
1868 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1869 !mnt->mnt_sb->s_subtype)
1870 mnt = fs_set_subtype(mnt, fstype);
1871 put_filesystem(type);
1872 return mnt;
1873}
1874
1875/* 1887/*
1876 * add a mount into a namespace's mount tree 1888 * add a mount into a namespace's mount tree
1877 */ 1889 */
@@ -1917,20 +1929,46 @@ unlock:
1917 * create a new mount for userspace and request it to be added into the 1929 * create a new mount for userspace and request it to be added into the
1918 * namespace's tree 1930 * namespace's tree
1919 */ 1931 */
1920static int do_new_mount(struct path *path, const char *type, int flags, 1932static int do_new_mount(struct path *path, const char *fstype, int flags,
1921 int mnt_flags, const char *name, void *data) 1933 int mnt_flags, const char *name, void *data)
1922{ 1934{
1935 struct file_system_type *type;
1936 struct user_namespace *user_ns;
1923 struct vfsmount *mnt; 1937 struct vfsmount *mnt;
1924 int err; 1938 int err;
1925 1939
1926 if (!type) 1940 if (!fstype)
1927 return -EINVAL; 1941 return -EINVAL;
1928 1942
1929 /* we need capabilities... */ 1943 /* we need capabilities... */
1930 if (!capable(CAP_SYS_ADMIN)) 1944 user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
1945 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1931 return -EPERM; 1946 return -EPERM;
1932 1947
1933 mnt = do_kern_mount(type, flags, name, data); 1948 type = get_fs_type(fstype);
1949 if (!type)
1950 return -ENODEV;
1951
1952 if (user_ns != &init_user_ns) {
1953 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
1954 put_filesystem(type);
1955 return -EPERM;
1956 }
1957 /* Only in special cases allow devices from mounts
1958 * created outside the initial user namespace.
1959 */
1960 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
1961 flags |= MS_NODEV;
1962 mnt_flags |= MNT_NODEV;
1963 }
1964 }
1965
1966 mnt = vfs_kern_mount(type, flags, name, data);
1967 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1968 !mnt->mnt_sb->s_subtype)
1969 mnt = fs_set_subtype(mnt, fstype);
1970
1971 put_filesystem(type);
1934 if (IS_ERR(mnt)) 1972 if (IS_ERR(mnt))
1935 return PTR_ERR(mnt); 1973 return PTR_ERR(mnt);
1936 1974
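
The reworked do_new_mount() gates mounting from a non-initial user namespace twice: the filesystem type must opt in with FS_USERNS_MOUNT at all, and unless it additionally sets FS_USERNS_DEV_MOUNT the kernel forces MS_NODEV/MNT_NODEV so device nodes on the new mount are inert. A condensed restatement of that gate (the flag bit values here are placeholders, not the kernel's):

#include <stdio.h>

/* Condensed model of the do_new_mount() user-namespace gate shown
 * above. Bit values are arbitrary placeholders. */
#define FS_USERNS_MOUNT		0x1
#define FS_USERNS_DEV_MOUNT	0x2
#define MS_NODEV		0x4

static int userns_mount_check(int fs_flags, int in_init_userns,
			      int *mnt_flags)
{
	if (in_init_userns)
		return 0;		/* unrestricted */
	if (!(fs_flags & FS_USERNS_MOUNT))
		return -1;		/* -EPERM: fs did not opt in */
	if (!(fs_flags & FS_USERNS_DEV_MOUNT))
		*mnt_flags |= MS_NODEV;	/* device nodes are inert */
	return 0;
}

int main(void)
{
	int flags = 0;

	if (userns_mount_check(FS_USERNS_MOUNT, 0, &flags) == 0)
		printf("mount allowed, nodev forced: %s\n",
		       (flags & MS_NODEV) ? "yes" : "no");
	return 0;
}
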
@@ -2261,18 +2299,42 @@ dput_out:
2261 return retval; 2299 return retval;
2262} 2300}
2263 2301
2264static struct mnt_namespace *alloc_mnt_ns(void) 2302static void free_mnt_ns(struct mnt_namespace *ns)
2303{
2304 proc_free_inum(ns->proc_inum);
2305 put_user_ns(ns->user_ns);
2306 kfree(ns);
2307}
2308
2309/*
2310 * Assign a sequence number so we can detect when we attempt to bind
2311 * mount a reference to an older mount namespace into the current
2312 * mount namespace, preventing reference counting loops. A 64bit
2313 * number incrementing at 10Ghz will take 12,427 years to wrap which
2314 * is effectively never, so we can ignore the possibility.
2315 */
2316static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2317
2318static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2265{ 2319{
2266 struct mnt_namespace *new_ns; 2320 struct mnt_namespace *new_ns;
2321 int ret;
2267 2322
2268 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2323 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2269 if (!new_ns) 2324 if (!new_ns)
2270 return ERR_PTR(-ENOMEM); 2325 return ERR_PTR(-ENOMEM);
2326 ret = proc_alloc_inum(&new_ns->proc_inum);
2327 if (ret) {
2328 kfree(new_ns);
2329 return ERR_PTR(ret);
2330 }
2331 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2271 atomic_set(&new_ns->count, 1); 2332 atomic_set(&new_ns->count, 1);
2272 new_ns->root = NULL; 2333 new_ns->root = NULL;
2273 INIT_LIST_HEAD(&new_ns->list); 2334 INIT_LIST_HEAD(&new_ns->list);
2274 init_waitqueue_head(&new_ns->poll); 2335 init_waitqueue_head(&new_ns->poll);
2275 new_ns->event = 0; 2336 new_ns->event = 0;
2337 new_ns->user_ns = get_user_ns(user_ns);
2276 return new_ns; 2338 return new_ns;
2277} 2339}
2278 2340
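
A quick sanity check of the wrap-time claim in the new comment: a 64-bit counter holds 2^64 ≈ 1.8x10^19 values, so even at an implausible 10^10 allocations per second it wraps after roughly 58 years, and the quoted 12,427-year figure corresponds to roughly 4.7x10^7 allocations per second. Since every increment requires a full alloc_mnt_ns(), real-world rates are lower by many orders of magnitude, so the "effectively never" conclusion holds either way:

#include <stdio.h>

int main(void)
{
	/* Wrap times for the 64-bit mnt_ns_seq counter at a few
	 * deliberately extreme allocation rates. */
	const double counter = 18446744073709551616.0;	/* 2^64 */
	const double rates[] = { 1e10, 4.7e7, 1e6 };	/* allocs/sec */
	const double year = 365.25 * 24 * 3600;
	int i;

	for (i = 0; i < 3; i++)
		printf("%9.1e allocs/s -> %12.0f years\n",
		       rates[i], counter / rates[i] / year);
	return 0;
}
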
@@ -2281,24 +2343,28 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2281 * copied from the namespace of the passed in task structure. 2343 * copied from the namespace of the passed in task structure.
2282 */ 2344 */
2283static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 2345static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2284 struct fs_struct *fs) 2346 struct user_namespace *user_ns, struct fs_struct *fs)
2285{ 2347{
2286 struct mnt_namespace *new_ns; 2348 struct mnt_namespace *new_ns;
2287 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2349 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2288 struct mount *p, *q; 2350 struct mount *p, *q;
2289 struct mount *old = mnt_ns->root; 2351 struct mount *old = mnt_ns->root;
2290 struct mount *new; 2352 struct mount *new;
2353 int copy_flags;
2291 2354
2292 new_ns = alloc_mnt_ns(); 2355 new_ns = alloc_mnt_ns(user_ns);
2293 if (IS_ERR(new_ns)) 2356 if (IS_ERR(new_ns))
2294 return new_ns; 2357 return new_ns;
2295 2358
2296 down_write(&namespace_sem); 2359 down_write(&namespace_sem);
2297 /* First pass: copy the tree topology */ 2360 /* First pass: copy the tree topology */
2298 new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); 2361 copy_flags = CL_COPY_ALL | CL_EXPIRE;
2362 if (user_ns != mnt_ns->user_ns)
2363 copy_flags |= CL_SHARED_TO_SLAVE;
2364 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2299 if (IS_ERR(new)) { 2365 if (IS_ERR(new)) {
2300 up_write(&namespace_sem); 2366 up_write(&namespace_sem);
2301 kfree(new_ns); 2367 free_mnt_ns(new_ns);
2302 return ERR_CAST(new); 2368 return ERR_CAST(new);
2303 } 2369 }
2304 new_ns->root = new; 2370 new_ns->root = new;
@@ -2339,7 +2405,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2339} 2405}
2340 2406
2341struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2407struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2342 struct fs_struct *new_fs) 2408 struct user_namespace *user_ns, struct fs_struct *new_fs)
2343{ 2409{
2344 struct mnt_namespace *new_ns; 2410 struct mnt_namespace *new_ns;
2345 2411
@@ -2349,7 +2415,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2349 if (!(flags & CLONE_NEWNS)) 2415 if (!(flags & CLONE_NEWNS))
2350 return ns; 2416 return ns;
2351 2417
2352 new_ns = dup_mnt_ns(ns, new_fs); 2418 new_ns = dup_mnt_ns(ns, user_ns, new_fs);
2353 2419
2354 put_mnt_ns(ns); 2420 put_mnt_ns(ns);
2355 return new_ns; 2421 return new_ns;
@@ -2361,7 +2427,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2361 */ 2427 */
2362static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2428static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2363{ 2429{
2364 struct mnt_namespace *new_ns = alloc_mnt_ns(); 2430 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2365 if (!IS_ERR(new_ns)) { 2431 if (!IS_ERR(new_ns)) {
2366 struct mount *mnt = real_mount(m); 2432 struct mount *mnt = real_mount(m);
2367 mnt->mnt_ns = new_ns; 2433 mnt->mnt_ns = new_ns;
@@ -2501,7 +2567,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2501 struct mount *new_mnt, *root_mnt; 2567 struct mount *new_mnt, *root_mnt;
2502 int error; 2568 int error;
2503 2569
2504 if (!capable(CAP_SYS_ADMIN)) 2570 if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
2505 return -EPERM; 2571 return -EPERM;
2506 2572
2507 error = user_path_dir(new_root, &new); 2573 error = user_path_dir(new_root, &new);
@@ -2583,8 +2649,13 @@ static void __init init_mount_tree(void)
2583 struct vfsmount *mnt; 2649 struct vfsmount *mnt;
2584 struct mnt_namespace *ns; 2650 struct mnt_namespace *ns;
2585 struct path root; 2651 struct path root;
2652 struct file_system_type *type;
2586 2653
2587 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2654 type = get_fs_type("rootfs");
2655 if (!type)
2656 panic("Can't find rootfs type");
2657 mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2658 put_filesystem(type);
2588 if (IS_ERR(mnt)) 2659 if (IS_ERR(mnt))
2589 panic("Can't create rootfs"); 2660 panic("Can't create rootfs");
2590 2661
@@ -2647,7 +2718,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
2647 br_write_unlock(&vfsmount_lock); 2718 br_write_unlock(&vfsmount_lock);
2648 up_write(&namespace_sem); 2719 up_write(&namespace_sem);
2649 release_mounts(&umount_list); 2720 release_mounts(&umount_list);
2650 kfree(ns); 2721 free_mnt_ns(ns);
2651} 2722}
2652 2723
2653struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 2724struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
@@ -2681,3 +2752,72 @@ bool our_mnt(struct vfsmount *mnt)
2681{ 2752{
2682 return check_mnt(real_mount(mnt)); 2753 return check_mnt(real_mount(mnt));
2683} 2754}
2755
2756static void *mntns_get(struct task_struct *task)
2757{
2758 struct mnt_namespace *ns = NULL;
2759 struct nsproxy *nsproxy;
2760
2761 rcu_read_lock();
2762 nsproxy = task_nsproxy(task);
2763 if (nsproxy) {
2764 ns = nsproxy->mnt_ns;
2765 get_mnt_ns(ns);
2766 }
2767 rcu_read_unlock();
2768
2769 return ns;
2770}
2771
2772static void mntns_put(void *ns)
2773{
2774 put_mnt_ns(ns);
2775}
2776
2777static int mntns_install(struct nsproxy *nsproxy, void *ns)
2778{
2779 struct fs_struct *fs = current->fs;
2780 struct mnt_namespace *mnt_ns = ns;
2781 struct path root;
2782
2783 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
2784 !nsown_capable(CAP_SYS_CHROOT) ||
2785 !nsown_capable(CAP_SYS_ADMIN))
2786 return -EPERM;
2787
2788 if (fs->users != 1)
2789 return -EINVAL;
2790
2791 get_mnt_ns(mnt_ns);
2792 put_mnt_ns(nsproxy->mnt_ns);
2793 nsproxy->mnt_ns = mnt_ns;
2794
2795 /* Find the root */
2796 root.mnt = &mnt_ns->root->mnt;
2797 root.dentry = mnt_ns->root->mnt.mnt_root;
2798 path_get(&root);
2799 while(d_mountpoint(root.dentry) && follow_down_one(&root))
2800 ;
2801
2802 /* Update the pwd and root */
2803 set_fs_pwd(fs, &root);
2804 set_fs_root(fs, &root);
2805
2806 path_put(&root);
2807 return 0;
2808}
2809
2810static unsigned int mntns_inum(void *ns)
2811{
2812 struct mnt_namespace *mnt_ns = ns;
2813 return mnt_ns->proc_inum;
2814}
2815
2816const struct proc_ns_operations mntns_operations = {
2817 .name = "mnt",
2818 .type = CLONE_NEWNS,
2819 .get = mntns_get,
2820 .put = mntns_put,
2821 .install = mntns_install,
2822 .inum = mntns_inum,
2823};
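
The new mntns_operations hookup is what makes /proc/PID/ns/mnt usable with setns(2). A sketch of the intended userspace side, assuming a kernel with this series applied and a caller holding the capabilities that mntns_install() checks (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/mnt", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0 || setns(fd, CLONE_NEWNS) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	/* Now in the target's mount namespace; mntns_install() has
	 * already reset this process's root and cwd. */
	execl("/bin/sh", "sh", (char *)NULL);
	return 1;
}
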
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index be20a7e171a0..63d14a99483d 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
89 /* 89 /*
90 * If I understand ncp_read_kernel() properly, the above always 90 * If I understand ncp_read_kernel() properly, the above always
91 * fetches from the network, here the analogue of disk. 91 * fetches from the network, here the analogue of disk.
92 * -- wli 92 * -- nyc
93 */ 93 */
94 count_vm_event(PGMAJFAULT); 94 count_vm_event(PGMAJFAULT);
95 mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); 95 mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b7db60897f91..cce2c057bd2d 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -24,7 +24,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 25 nfs4namespace.o nfs4getroot.o nfs4client.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f1027b06a1a9..4fa788c93f46 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -40,6 +40,7 @@
40#include <linux/pagevec.h> 40#include <linux/pagevec.h>
41 41
42#include "../pnfs.h" 42#include "../pnfs.h"
43#include "../nfs4session.h"
43#include "../internal.h" 44#include "../internal.h"
44#include "blocklayout.h" 45#include "blocklayout.h"
45 46
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index dded26368111..862a2f16db64 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -118,7 +118,6 @@ int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
118 struct dentry *dir; 118 struct dentry *dir;
119 119
120 dir = rpc_d_lookup_sb(sb, "cache"); 120 dir = rpc_d_lookup_sb(sb, "cache");
121 BUG_ON(dir == NULL);
122 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); 121 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
123 dput(dir); 122 dput(dir);
124 return ret; 123 return ret;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 4251c2ae06ad..efd54f0a4c46 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -142,7 +142,7 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
142 142
143struct cb_recallslotargs { 143struct cb_recallslotargs {
144 struct sockaddr *crsa_addr; 144 struct sockaddr *crsa_addr;
145 uint32_t crsa_target_max_slots; 145 uint32_t crsa_target_highest_slotid;
146}; 146};
147extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, 147extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
148 void *dummy, 148 void *dummy,
@@ -167,8 +167,6 @@ extern __be32 nfs4_callback_layoutrecall(
167 struct cb_layoutrecallargs *args, 167 struct cb_layoutrecallargs *args,
168 void *dummy, struct cb_process_state *cps); 168 void *dummy, struct cb_process_state *cps);
169 169
170extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
171
172struct cb_devicenotifyitem { 170struct cb_devicenotifyitem {
173 uint32_t cbd_notify_type; 171 uint32_t cbd_notify_type;
174 uint32_t cbd_layout_type; 172 uint32_t cbd_layout_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 76b4a7a3e559..c89b26bc9759 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,6 +14,7 @@
14#include "delegation.h" 14#include "delegation.h"
15#include "internal.h" 15#include "internal.h"
16#include "pnfs.h" 16#include "pnfs.h"
17#include "nfs4session.h"
17 18
18#ifdef NFS_DEBUG 19#ifdef NFS_DEBUG
19#define NFSDBG_FACILITY NFSDBG_CALLBACK 20#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -216,7 +217,6 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
216 } 217 }
217 pnfs_get_layout_hdr(lo); 218 pnfs_get_layout_hdr(lo);
218 spin_unlock(&ino->i_lock); 219 spin_unlock(&ino->i_lock);
219 BUG_ON(!list_empty(&lo->plh_bulk_recall));
220 list_add(&lo->plh_bulk_recall, &recall_list); 220 list_add(&lo->plh_bulk_recall, &recall_list);
221 } 221 }
222 } 222 }
@@ -562,23 +562,16 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
562 if (!cps->clp) /* set in cb_sequence */ 562 if (!cps->clp) /* set in cb_sequence */
563 goto out; 563 goto out;
564 564
565 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 565 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n",
566 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), 566 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
567 args->crsa_target_max_slots); 567 args->crsa_target_highest_slotid);
568 568
569 fc_tbl = &cps->clp->cl_session->fc_slot_table; 569 fc_tbl = &cps->clp->cl_session->fc_slot_table;
570 570
571 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
572 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
573 args->crsa_target_max_slots < 1)
574 goto out;
575
576 status = htonl(NFS4_OK); 571 status = htonl(NFS4_OK);
577 if (args->crsa_target_max_slots == fc_tbl->max_slots)
578 goto out;
579 572
580 fc_tbl->target_max_slots = args->crsa_target_max_slots; 573 nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
581 nfs41_handle_recall_slot(cps->clp); 574 nfs41_server_notify_target_slotid_update(cps->clp);
582out: 575out:
583 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 576 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
584 return status; 577 return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 742ff4ffced7..59461c957d9d 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -16,6 +16,7 @@
16#include "nfs4_fs.h" 16#include "nfs4_fs.h"
17#include "callback.h" 17#include "callback.h"
18#include "internal.h" 18#include "internal.h"
19#include "nfs4session.h"
19 20
20#define CB_OP_TAGLEN_MAXSZ (512) 21#define CB_OP_TAGLEN_MAXSZ (512)
21#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 22#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -520,7 +521,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
520 p = read_buf(xdr, 4); 521 p = read_buf(xdr, 4);
521 if (unlikely(p == NULL)) 522 if (unlikely(p == NULL))
522 return htonl(NFS4ERR_BADXDR); 523 return htonl(NFS4ERR_BADXDR);
523 args->crsa_target_max_slots = ntohl(*p++); 524 args->crsa_target_highest_slotid = ntohl(*p++);
524 return 0; 525 return 0;
525} 526}
526 527
@@ -762,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
762 * A single slot, so highest used slotid is either 0 or -1 763 * A single slot, so highest used slotid is either 0 or -1
763 */ 764 */
764 tbl->highest_used_slotid = NFS4_NO_SLOT; 765 tbl->highest_used_slotid = NFS4_NO_SLOT;
765 nfs4_check_drain_bc_complete(session); 766 nfs4_session_drain_complete(session, tbl);
766 spin_unlock(&tbl->slot_tbl_lock); 767 spin_unlock(&tbl->slot_tbl_lock);
767} 768}
768 769
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8b39a42ac35e..9f3c66438d0e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -277,7 +277,7 @@ void nfs_put_client(struct nfs_client *clp)
277 nfs_cb_idr_remove_locked(clp); 277 nfs_cb_idr_remove_locked(clp);
278 spin_unlock(&nn->nfs_client_lock); 278 spin_unlock(&nn->nfs_client_lock);
279 279
280 BUG_ON(!list_empty(&clp->cl_superblocks)); 280 WARN_ON_ONCE(!list_empty(&clp->cl_superblocks));
281 281
282 clp->rpc_ops->free_client(clp); 282 clp->rpc_ops->free_client(clp);
283 } 283 }
@@ -615,8 +615,7 @@ EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
615 */ 615 */
616static void nfs_destroy_server(struct nfs_server *server) 616static void nfs_destroy_server(struct nfs_server *server)
617{ 617{
618 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || 618 if (server->nlm_host)
619 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
620 nlmclnt_done(server->nlm_host); 619 nlmclnt_done(server->nlm_host);
621} 620}
622 621
@@ -1061,10 +1060,6 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1061 if (error < 0) 1060 if (error < 0)
1062 goto error; 1061 goto error;
1063 1062
1064 BUG_ON(!server->nfs_client);
1065 BUG_ON(!server->nfs_client->rpc_ops);
1066 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1067
1068 /* Probe the root fh to retrieve its FSID */ 1063 /* Probe the root fh to retrieve its FSID */
1069 error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); 1064 error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr);
1070 if (error < 0) 1065 if (error < 0)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce8cb926526b..32e6c53520e2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
450 nfs_refresh_inode(dentry->d_inode, entry->fattr); 450 nfs_refresh_inode(dentry->d_inode, entry->fattr);
451 goto out; 451 goto out;
452 } else { 452 } else {
453 d_drop(dentry); 453 if (d_invalidate(dentry) != 0)
454 goto out;
454 dput(dentry); 455 dput(dentry);
455 } 456 }
456 } 457 }
@@ -870,7 +871,7 @@ out:
870 return res; 871 return res;
871} 872}
872 873
873static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 874static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
874{ 875{
875 struct dentry *dentry = filp->f_path.dentry; 876 struct dentry *dentry = filp->f_path.dentry;
876 struct inode *inode = dentry->d_inode; 877 struct inode *inode = dentry->d_inode;
@@ -879,10 +880,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
879 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
880 dentry->d_parent->d_name.name, 881 dentry->d_parent->d_name.name,
881 dentry->d_name.name, 882 dentry->d_name.name,
882 offset, origin); 883 offset, whence);
883 884
884 mutex_lock(&inode->i_mutex); 885 mutex_lock(&inode->i_mutex);
885 switch (origin) { 886 switch (whence) {
886 case 1: 887 case 1:
887 offset += filp->f_pos; 888 offset += filp->f_pos;
888 case 0: 889 case 0:
@@ -978,10 +979,11 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
978 * particular file and the "nocto" mount flag is not set. 979 * particular file and the "nocto" mount flag is not set.
979 * 980 *
980 */ 981 */
981static inline 982static
982int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) 983int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
983{ 984{
984 struct nfs_server *server = NFS_SERVER(inode); 985 struct nfs_server *server = NFS_SERVER(inode);
986 int ret;
985 987
986 if (IS_AUTOMOUNT(inode)) 988 if (IS_AUTOMOUNT(inode))
987 return 0; 989 return 0;
@@ -992,9 +994,13 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
992 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && 994 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) &&
993 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 995 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
994 goto out_force; 996 goto out_force;
995 return 0; 997out:
998 return (inode->i_nlink == 0) ? -ENOENT : 0;
996out_force: 999out_force:
997 return __nfs_revalidate_inode(server, inode); 1000 ret = __nfs_revalidate_inode(server, inode);
1001 if (ret != 0)
1002 return ret;
1003 goto out;
998} 1004}
999 1005
1000/* 1006/*
@@ -1100,6 +1106,8 @@ out_set_verifier:
1100out_zap_parent: 1106out_zap_parent:
1101 nfs_zap_caches(dir); 1107 nfs_zap_caches(dir);
1102 out_bad: 1108 out_bad:
1109 nfs_free_fattr(fattr);
1110 nfs_free_fhandle(fhandle);
1103 nfs_mark_for_revalidate(dir); 1111 nfs_mark_for_revalidate(dir);
1104 if (inode && S_ISDIR(inode->i_mode)) { 1112 if (inode && S_ISDIR(inode->i_mode)) {
1105 /* Purge readdir caches. */ 1113 /* Purge readdir caches. */
@@ -1112,8 +1120,6 @@ out_zap_parent:
1112 shrink_dcache_parent(dentry); 1120 shrink_dcache_parent(dentry);
1113 } 1121 }
1114 d_drop(dentry); 1122 d_drop(dentry);
1115 nfs_free_fattr(fattr);
1116 nfs_free_fhandle(fhandle);
1117 dput(parent); 1123 dput(parent);
1118 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 1124 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
1119 __func__, dentry->d_parent->d_name.name, 1125 __func__, dentry->d_parent->d_name.name,
@@ -1155,11 +1161,14 @@ static int nfs_dentry_delete(const struct dentry *dentry)
1155 1161
1156} 1162}
1157 1163
1164/* Ensure that we revalidate inode->i_nlink */
1158static void nfs_drop_nlink(struct inode *inode) 1165static void nfs_drop_nlink(struct inode *inode)
1159{ 1166{
1160 spin_lock(&inode->i_lock); 1167 spin_lock(&inode->i_lock);
1161 if (inode->i_nlink > 0) 1168 /* drop the inode if we're reasonably sure this is the last link */
1162 drop_nlink(inode); 1169 if (inode->i_nlink == 1)
1170 clear_nlink(inode);
1171 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
1163 spin_unlock(&inode->i_lock); 1172 spin_unlock(&inode->i_lock);
1164} 1173}
1165 1174
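
The reworked nfs_drop_nlink() is deliberately conservative: it only zeroes the link count when the client can be sure this was the last link (i_nlink == 1), and in every case marks the cached attributes invalid so the next revalidation refetches nlink from the server. Together with the new -ENOENT return in nfs_lookup_verify_inode() above, this avoids both premature inode death and stale positive dentries. The policy in isolation (a sketch, not the kernel code):

/* clear_nlink() only on a provable last link, and always force a
 * fresh GETATTR later. Sketch only. */
struct model_inode {
	unsigned int nlink;
	int attrs_valid;
};

static void model_drop_nlink(struct model_inode *inode)
{
	if (inode->nlink == 1)
		inode->nlink = 0;	/* safe: no other links existed */
	inode->attrs_valid = 0;		/* NFS_INO_INVALID_ATTR */
}
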
@@ -1174,8 +1183,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1174 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 1183 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
1175 1184
1176 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1185 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1177 drop_nlink(inode);
1178 nfs_complete_unlink(dentry, inode); 1186 nfs_complete_unlink(dentry, inode);
1187 nfs_drop_nlink(inode);
1179 } 1188 }
1180 iput(inode); 1189 iput(inode);
1181} 1190}
@@ -1646,10 +1655,8 @@ static int nfs_safe_remove(struct dentry *dentry)
1646 if (inode != NULL) { 1655 if (inode != NULL) {
1647 NFS_PROTO(inode)->return_delegation(inode); 1656 NFS_PROTO(inode)->return_delegation(inode);
1648 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1657 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1649 /* The VFS may want to delete this inode */
1650 if (error == 0) 1658 if (error == 0)
1651 nfs_drop_nlink(inode); 1659 nfs_drop_nlink(inode);
1652 nfs_mark_for_revalidate(inode);
1653 } else 1660 } else
1654 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1661 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1655 if (error == -ENOENT) 1662 if (error == -ENOENT)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index cae26cbd59ee..0bd7a55a5f07 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,21 +266,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
266 struct nfs_page *req = nfs_list_entry(hdr->pages.next); 266 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
267 struct page *page = req->wb_page; 267 struct page *page = req->wb_page;
268 268
269 if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { 269 if (!PageCompound(page) && bytes < hdr->good_bytes)
270 if (bytes > hdr->good_bytes) 270 set_page_dirty(page);
271 zero_user(page, 0, PAGE_SIZE);
272 else if (hdr->good_bytes - bytes < PAGE_SIZE)
273 zero_user_segment(page,
274 hdr->good_bytes & ~PAGE_MASK,
275 PAGE_SIZE);
276 }
277 if (!PageCompound(page)) {
278 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
279 if (bytes < hdr->good_bytes)
280 set_page_dirty(page);
281 } else
282 set_page_dirty(page);
283 }
284 bytes += req->wb_bytes; 271 bytes += req->wb_bytes;
285 nfs_list_remove_request(req); 272 nfs_list_remove_request(req);
286 nfs_direct_readpage_release(req); 273 nfs_direct_readpage_release(req);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 582bb8866131..3c2b893665ba 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -119,18 +119,18 @@ force_reval:
119 return __nfs_revalidate_inode(server, inode); 119 return __nfs_revalidate_inode(server, inode);
120} 120}
121 121
122loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 122loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
123{ 123{
124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
125 filp->f_path.dentry->d_parent->d_name.name, 125 filp->f_path.dentry->d_parent->d_name.name,
126 filp->f_path.dentry->d_name.name, 126 filp->f_path.dentry->d_name.name,
127 offset, origin); 127 offset, whence);
128 128
129 /* 129 /*
130 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 130 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
131 * the cached file length 131 * the cached file length
132 */ 132 */
133 if (origin != SEEK_SET && origin != SEEK_CUR) { 133 if (whence != SEEK_SET && whence != SEEK_CUR) {
134 struct inode *inode = filp->f_mapping->host; 134 struct inode *inode = filp->f_mapping->host;
135 135
136 int retval = nfs_revalidate_file_size(inode, filp); 136 int retval = nfs_revalidate_file_size(inode, filp);
@@ -138,7 +138,7 @@ loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
138 return (loff_t)retval; 138 return (loff_t)retval;
139 } 139 }
140 140
141 return generic_file_llseek(filp, offset, origin); 141 return generic_file_llseek(filp, offset, whence);
142} 142}
143EXPORT_SYMBOL_GPL(nfs_file_llseek); 143EXPORT_SYMBOL_GPL(nfs_file_llseek);
144 144
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9cc4a3fbf4b0..bc3968fa81e5 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -193,19 +193,15 @@ static int nfs_idmap_init_keyring(void)
193 if (!cred) 193 if (!cred)
194 return -ENOMEM; 194 return -ENOMEM;
195 195
196 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, 196 keyring = keyring_alloc(".id_resolver", 0, 0, cred,
197 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 197 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
198 KEY_USR_VIEW | KEY_USR_READ, 198 KEY_USR_VIEW | KEY_USR_READ,
199 KEY_ALLOC_NOT_IN_QUOTA); 199 KEY_ALLOC_NOT_IN_QUOTA, NULL);
200 if (IS_ERR(keyring)) { 200 if (IS_ERR(keyring)) {
201 ret = PTR_ERR(keyring); 201 ret = PTR_ERR(keyring);
202 goto failed_put_cred; 202 goto failed_put_cred;
203 } 203 }
204 204
205 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
206 if (ret < 0)
207 goto failed_put_key;
208
209 ret = register_key_type(&key_type_id_resolver); 205 ret = register_key_type(&key_type_id_resolver);
210 if (ret < 0) 206 if (ret < 0)
211 goto failed_put_key; 207 goto failed_put_key;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6fa01aea2488..2faae14d89f4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -107,13 +107,19 @@ u64 nfs_compat_user_ino64(u64 fileid)
107 return ino; 107 return ino;
108} 108}
109 109
110int nfs_drop_inode(struct inode *inode)
111{
112 return NFS_STALE(inode) || generic_drop_inode(inode);
113}
114EXPORT_SYMBOL_GPL(nfs_drop_inode);
115
110void nfs_clear_inode(struct inode *inode) 116void nfs_clear_inode(struct inode *inode)
111{ 117{
112 /* 118 /*
113 * The following should never happen... 119 * The following should never happen...
114 */ 120 */
115 BUG_ON(nfs_have_writebacks(inode)); 121 WARN_ON_ONCE(nfs_have_writebacks(inode));
116 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
117 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
118 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
119 nfs_fscache_release_inode_cookie(inode); 125 nfs_fscache_release_inode_cookie(inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 05521cadac2e..f0e6c7df1a07 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -18,27 +18,6 @@ struct nfs_string;
18 */ 18 */
19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) 19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
20 20
21/*
22 * Determine if sessions are in use.
23 */
24static inline int nfs4_has_session(const struct nfs_client *clp)
25{
26#ifdef CONFIG_NFS_V4_1
27 if (clp->cl_session)
28 return 1;
29#endif /* CONFIG_NFS_V4_1 */
30 return 0;
31}
32
33static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
34{
35#ifdef CONFIG_NFS_V4_1
36 if (nfs4_has_session(clp))
37 return (clp->cl_session->flags & SESSION4_PERSIST);
38#endif /* CONFIG_NFS_V4_1 */
39 return 0;
40}
41
42static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) 21static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
43{ 22{
44 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) 23 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
@@ -276,8 +255,6 @@ extern const u32 nfs41_maxwrite_overhead;
276extern struct rpc_procinfo nfs4_procedures[]; 255extern struct rpc_procinfo nfs4_procedures[];
277#endif 256#endif
278 257
279extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
280
281/* proc.c */ 258/* proc.c */
282void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 259void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
283extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 260extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
@@ -319,6 +296,7 @@ extern struct workqueue_struct *nfsiod_workqueue;
319extern struct inode *nfs_alloc_inode(struct super_block *sb); 296extern struct inode *nfs_alloc_inode(struct super_block *sb);
320extern void nfs_destroy_inode(struct inode *); 297extern void nfs_destroy_inode(struct inode *);
321extern int nfs_write_inode(struct inode *, struct writeback_control *); 298extern int nfs_write_inode(struct inode *, struct writeback_control *);
299extern int nfs_drop_inode(struct inode *);
322extern void nfs_clear_inode(struct inode *); 300extern void nfs_clear_inode(struct inode *);
323extern void nfs_evict_inode(struct inode *); 301extern void nfs_evict_inode(struct inode *);
324void nfs_zap_acl_cache(struct inode *inode); 302void nfs_zap_acl_cache(struct inode *inode);
@@ -386,9 +364,6 @@ extern int nfs_initiate_read(struct rpc_clnt *clnt,
386extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 364extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
387extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, 365extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
388 struct nfs_pgio_header *hdr); 366 struct nfs_pgio_header *hdr);
389extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
390 struct inode *inode,
391 const struct nfs_pgio_completion_ops *compl_ops);
392extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); 367extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
393extern void nfs_readdata_release(struct nfs_read_data *rdata); 368extern void nfs_readdata_release(struct nfs_read_data *rdata);
394 369
@@ -411,9 +386,6 @@ extern struct nfs_write_header *nfs_writehdr_alloc(void);
411extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); 386extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
412extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, 387extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
413 struct nfs_pgio_header *hdr); 388 struct nfs_pgio_header *hdr);
414extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
415 struct inode *inode, int ioflags,
416 const struct nfs_pgio_completion_ops *compl_ops);
417extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); 389extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
418extern void nfs_writedata_release(struct nfs_write_data *wdata); 390extern void nfs_writedata_release(struct nfs_write_data *wdata);
419extern void nfs_commit_free(struct nfs_commit_data *p); 391extern void nfs_commit_free(struct nfs_commit_data *p);
@@ -474,18 +446,6 @@ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
474 const struct rpc_timeout *timeparms, 446 const struct rpc_timeout *timeparms,
475 const char *ip_addr, 447 const char *ip_addr,
476 rpc_authflavor_t authflavour); 448 rpc_authflavor_t authflavour);
477extern int _nfs4_call_sync(struct rpc_clnt *clnt,
478 struct nfs_server *server,
479 struct rpc_message *msg,
480 struct nfs4_sequence_args *args,
481 struct nfs4_sequence_res *res,
482 int cache_reply);
483extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
484 struct nfs_server *server,
485 struct rpc_message *msg,
486 struct nfs4_sequence_args *args,
487 struct nfs4_sequence_res *res,
488 int cache_reply);
489extern int nfs40_walk_client_list(struct nfs_client *clp, 449extern int nfs40_walk_client_list(struct nfs_client *clp,
490 struct nfs_client **result, 450 struct nfs_client **result,
491 struct rpc_cred *cred); 451 struct rpc_cred *cred);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 015f71f8f62c..91a6faf811ac 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -169,6 +169,9 @@ int nfs_mount(struct nfs_mount_request *info)
169 (info->hostname ? info->hostname : "server"), 169 (info->hostname ? info->hostname : "server"),
170 info->dirpath); 170 info->dirpath);
171 171
172 if (strlen(info->dirpath) > MNTPATHLEN)
173 return -ENAMETOOLONG;
174
172 if (info->noresvport) 175 if (info->noresvport)
173 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 176 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
174 177
@@ -242,6 +245,9 @@ void nfs_umount(const struct nfs_mount_request *info)
242 struct rpc_clnt *clnt; 245 struct rpc_clnt *clnt;
243 int status; 246 int status;
244 247
248 if (strlen(info->dirpath) > MNTPATHLEN)
249 return;
250
245 if (info->noresvport) 251 if (info->noresvport)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 252 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247 253
@@ -283,7 +289,6 @@ static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
283 const u32 pathname_len = strlen(pathname); 289 const u32 pathname_len = strlen(pathname);
284 __be32 *p; 290 __be32 *p;
285 291
286 BUG_ON(pathname_len > MNTPATHLEN);
287 p = xdr_reserve_space(xdr, 4 + pathname_len); 292 p = xdr_reserve_space(xdr, 4 + pathname_len);
288 xdr_encode_opaque(p, pathname, pathname_len); 293 xdr_encode_opaque(p, pathname, pathname_len);
289} 294}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index d04f0df7be55..06b9df49f7f7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -195,7 +195,6 @@ static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
195{ 195{
196 __be32 *p; 196 __be32 *p;
197 197
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 p = xdr_reserve_space(xdr, NFS2_FHSIZE); 198 p = xdr_reserve_space(xdr, NFS2_FHSIZE);
200 memcpy(p, fh->data, NFS2_FHSIZE); 199 memcpy(p, fh->data, NFS2_FHSIZE);
201} 200}
@@ -388,7 +387,7 @@ static void encode_filename(struct xdr_stream *xdr,
388{ 387{
389 __be32 *p; 388 __be32 *p;
390 389
391 BUG_ON(length > NFS2_MAXNAMLEN); 390 WARN_ON_ONCE(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length); 391 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length); 392 xdr_encode_opaque(p, name, length);
394} 393}
@@ -428,7 +427,6 @@ static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
428{ 427{
429 __be32 *p; 428 __be32 *p;
430 429
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4); 430 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length); 431 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length); 432 xdr_write_pages(xdr, pages, 0, length);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 69322096c325..70efb63b1e42 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -24,14 +24,14 @@
24 24
25#define NFSDBG_FACILITY NFSDBG_PROC 25#define NFSDBG_FACILITY NFSDBG_PROC
26 26
27/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ 27/* A wrapper to handle the EJUKEBOX error messages */
28static int 28static int
29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
30{ 30{
31 int res; 31 int res;
32 do { 32 do {
33 res = rpc_call_sync(clnt, msg, flags); 33 res = rpc_call_sync(clnt, msg, flags);
34 if (res != -EJUKEBOX && res != -EKEYEXPIRED) 34 if (res != -EJUKEBOX)
35 break; 35 break;
36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
37 res = -ERESTARTSYS; 37 res = -ERESTARTSYS;
@@ -44,7 +44,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
44static int 44static int
45nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 45nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
46{ 46{
47 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) 47 if (task->tk_status != -EJUKEBOX)
48 return 0; 48 return 0;
49 if (task->tk_status == -EJUKEBOX) 49 if (task->tk_status == -EJUKEBOX)
50 nfs_inc_stats(inode, NFSIOS_DELAY); 50 nfs_inc_stats(inode, NFSIOS_DELAY);
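
EJUKEBOX is NFSv3's "resource temporarily unavailable" error, typically returned while a file is being recalled from slow media; both wrappers retry after a freezable, killable sleep rather than failing the syscall. The patch also stops special-casing -EKEYEXPIRED here (and, below, in the filelayout driver), evidently leaving expired-key handling to a lower layer after this series. The retry shape in isolation (user-space sketch; the error value and delay are stand-ins):

#include <unistd.h>

#define MODEL_EJUKEBOX		10008	/* NFS3ERR_JUKEBOX wire value */
#define MODEL_RETRY_SECONDS	5	/* stand-in for NFS_JUKEBOX_RETRY_TIME */

/* Retry shape of nfs3_rpc_wrapper(): on "jukebox, try again later"
 * sleep and reissue the call; any other result is final. The kernel
 * sleeps freezable and killable, not with sleep(). */
static int call_with_jukebox_retry(int (*call)(void))
{
	int res;

	for (;;) {
		res = call();
		if (res != -MODEL_EJUKEBOX)
			return res;
		sleep(MODEL_RETRY_SECONDS);
	}
}
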
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cbe89400dfc..bffc32406fbf 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -198,7 +198,7 @@ static void encode_filename3(struct xdr_stream *xdr,
198{ 198{
199 __be32 *p; 199 __be32 *p;
200 200
201 BUG_ON(length > NFS3_MAXNAMLEN); 201 WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
202 p = xdr_reserve_space(xdr, 4 + length); 202 p = xdr_reserve_space(xdr, 4 + length);
203 xdr_encode_opaque(p, name, length); 203 xdr_encode_opaque(p, name, length);
204} 204}
@@ -238,7 +238,6 @@ out_overflow:
238static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, 238static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
239 const u32 length) 239 const u32 length)
240{ 240{
241 BUG_ON(length > NFS3_MAXPATHLEN);
242 encode_uint32(xdr, length); 241 encode_uint32(xdr, length);
243 xdr_write_pages(xdr, pages, 0, length); 242 xdr_write_pages(xdr, pages, 0, length);
244} 243}
@@ -388,7 +387,6 @@ out_overflow:
388 */ 387 */
389static void encode_ftype3(struct xdr_stream *xdr, const u32 type) 388static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
390{ 389{
391 BUG_ON(type > NF3FIFO);
392 encode_uint32(xdr, type); 390 encode_uint32(xdr, type);
393} 391}
394 392
@@ -443,7 +441,7 @@ static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
443{ 441{
444 __be32 *p; 442 __be32 *p;
445 443
446 BUG_ON(fh->size > NFS3_FHSIZE); 444 WARN_ON_ONCE(fh->size > NFS3_FHSIZE);
447 p = xdr_reserve_space(xdr, 4 + fh->size); 445 p = xdr_reserve_space(xdr, 4 + fh->size);
448 xdr_encode_opaque(p, fh->data, fh->size); 446 xdr_encode_opaque(p, fh->data, fh->size);
449} 447}
@@ -1339,6 +1337,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
1339 error = nfsacl_encode(xdr->buf, base, args->inode, 1337 error = nfsacl_encode(xdr->buf, base, args->inode,
1340 (args->mask & NFS_ACL) ? 1338 (args->mask & NFS_ACL) ?
1341 args->acl_access : NULL, 1, 0); 1339 args->acl_access : NULL, 1, 0);
1340 /* FIXME: this is just broken */
1342 BUG_ON(error < 0); 1341 BUG_ON(error < 0);
1343 error = nfsacl_encode(xdr->buf, base + error, args->inode, 1342 error = nfsacl_encode(xdr->buf, base + error, args->inode,
1344 (args->mask & NFS_DFACL) ? 1343 (args->mask & NFS_DFACL) ?
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a525fdefccde..a3f488b074a2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -11,6 +11,8 @@
11 11
12#if IS_ENABLED(CONFIG_NFS_V4) 12#if IS_ENABLED(CONFIG_NFS_V4)
13 13
14#define NFS4_MAX_LOOP_ON_RECOVER (10)
15
14struct idmap; 16struct idmap;
15 17
16enum nfs4_client_state { 18enum nfs4_client_state {
@@ -21,18 +23,12 @@ enum nfs4_client_state {
21 NFS4CLNT_RECLAIM_NOGRACE, 23 NFS4CLNT_RECLAIM_NOGRACE,
22 NFS4CLNT_DELEGRETURN, 24 NFS4CLNT_DELEGRETURN,
23 NFS4CLNT_SESSION_RESET, 25 NFS4CLNT_SESSION_RESET,
24 NFS4CLNT_RECALL_SLOT,
25 NFS4CLNT_LEASE_CONFIRM, 26 NFS4CLNT_LEASE_CONFIRM,
26 NFS4CLNT_SERVER_SCOPE_MISMATCH, 27 NFS4CLNT_SERVER_SCOPE_MISMATCH,
27 NFS4CLNT_PURGE_STATE, 28 NFS4CLNT_PURGE_STATE,
28 NFS4CLNT_BIND_CONN_TO_SESSION, 29 NFS4CLNT_BIND_CONN_TO_SESSION,
29}; 30};
30 31
31enum nfs4_session_state {
32 NFS4_SESSION_INITING,
33 NFS4_SESSION_DRAINING,
34};
35
36#define NFS4_RENEW_TIMEOUT 0x01 32#define NFS4_RENEW_TIMEOUT 0x01
37#define NFS4_RENEW_DELEGATION_CB 0x02 33#define NFS4_RENEW_DELEGATION_CB 0x02
38 34
@@ -43,8 +39,7 @@ struct nfs4_minor_version_ops {
43 struct nfs_server *server, 39 struct nfs_server *server,
44 struct rpc_message *msg, 40 struct rpc_message *msg,
45 struct nfs4_sequence_args *args, 41 struct nfs4_sequence_args *args,
46 struct nfs4_sequence_res *res, 42 struct nfs4_sequence_res *res);
47 int cache_reply);
48 bool (*match_stateid)(const nfs4_stateid *, 43 bool (*match_stateid)(const nfs4_stateid *,
49 const nfs4_stateid *); 44 const nfs4_stateid *);
50 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 45 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
@@ -241,18 +236,14 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
241 return server->nfs_client->cl_session; 236 return server->nfs_client->cl_session;
242} 237}
243 238
244extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);
245extern int nfs4_setup_sequence(const struct nfs_server *server, 239extern int nfs4_setup_sequence(const struct nfs_server *server,
246 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 240 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
247 struct rpc_task *task); 241 struct rpc_task *task);
248extern int nfs41_setup_sequence(struct nfs4_session *session, 242extern int nfs41_setup_sequence(struct nfs4_session *session,
249 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 243 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
250 struct rpc_task *task); 244 struct rpc_task *task);
251extern void nfs4_destroy_session(struct nfs4_session *session);
252extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
253extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); 245extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
254extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); 246extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
255extern int nfs4_init_session(struct nfs_server *server);
256extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 247extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
257 struct nfs_fsinfo *fsinfo); 248 struct nfs_fsinfo *fsinfo);
258extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, 249extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
@@ -280,11 +271,7 @@ static inline int nfs4_setup_sequence(const struct nfs_server *server,
280 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
281 struct rpc_task *task) 272 struct rpc_task *task)
282{ 273{
283 return 0; 274 rpc_call_start(task);
284}
285
286static inline int nfs4_init_session(struct nfs_server *server)
287{
288 return 0; 275 return 0;
289} 276}
290 277
@@ -321,17 +308,20 @@ extern void nfs4_renew_state(struct work_struct *);
321 308
322/* nfs4state.c */ 309/* nfs4state.c */
323struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 310struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
311struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 312struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp, 313int nfs4_discover_server_trunking(struct nfs_client *clp,
326 struct nfs_client **); 314 struct nfs_client **);
327int nfs40_discover_server_trunking(struct nfs_client *clp, 315int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *); 316 struct nfs_client **, struct rpc_cred *);
329#if defined(CONFIG_NFS_V4_1) 317#if defined(CONFIG_NFS_V4_1)
330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 318struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332int nfs41_discover_server_trunking(struct nfs_client *clp, 319int nfs41_discover_server_trunking(struct nfs_client *clp,
333 struct nfs_client **, struct rpc_cred *); 320 struct nfs_client **, struct rpc_cred *);
334extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 321extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
322extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
323extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
324
335#else 325#else
336static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 326static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
337{ 327{
@@ -349,11 +339,12 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
349extern void nfs_inode_find_state_and_recover(struct inode *inode, 339extern void nfs_inode_find_state_and_recover(struct inode *inode,
350 const nfs4_stateid *stateid); 340 const nfs4_stateid *stateid);
351extern void nfs4_schedule_lease_recovery(struct nfs_client *); 341extern void nfs4_schedule_lease_recovery(struct nfs_client *);
342extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
343extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
352extern void nfs4_schedule_state_manager(struct nfs_client *); 344extern void nfs4_schedule_state_manager(struct nfs_client *);
353extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); 345extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
354extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); 346extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
355extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 347extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
356extern void nfs41_handle_recall_slot(struct nfs_client *clp);
357extern void nfs41_handle_server_scope(struct nfs_client *, 348extern void nfs41_handle_server_scope(struct nfs_client *,
358 struct nfs41_server_scope **); 349 struct nfs41_server_scope **);
359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 350extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 6bacfde1319a..acc347268124 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -12,6 +12,7 @@
12#include "internal.h" 12#include "internal.h"
13#include "callback.h" 13#include "callback.h"
14#include "delegation.h" 14#include "delegation.h"
15#include "nfs4session.h"
15#include "pnfs.h" 16#include "pnfs.h"
16#include "netns.h" 17#include "netns.h"
17 18
@@ -713,10 +714,6 @@ static int nfs4_server_common_setup(struct nfs_server *server,
713 struct nfs_fattr *fattr; 714 struct nfs_fattr *fattr;
714 int error; 715 int error;
715 716
716 BUG_ON(!server->nfs_client);
717 BUG_ON(!server->nfs_client->rpc_ops);
718 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
719
720 /* data servers support only a subset of NFSv4.1 */ 717 /* data servers support only a subset of NFSv4.1 */
721 if (is_ds_only_client(server->nfs_client)) 718 if (is_ds_only_client(server->nfs_client))
722 return -EPROTONOSUPPORT; 719 return -EPROTONOSUPPORT;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index afddd6639afb..e7699308364a 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -20,7 +20,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
20 struct iattr attr; 20 struct iattr attr;
21 int err; 21 int err;
22 22
23 BUG_ON(inode != dentry->d_inode);
24 /* 23 /*
25 * If no cached dentry exists or if it's negative, NFSv4 handled the 24 * If no cached dentry exists or if it's negative, NFSv4 handled the
26 * opens in ->lookup() or ->create(). 25 * opens in ->lookup() or ->create().
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e45fd9c02a3..194c48410336 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/sunrpc/metrics.h> 36#include <linux/sunrpc/metrics.h>
37 37
38#include "nfs4session.h"
38#include "internal.h" 39#include "internal.h"
39#include "delegation.h" 40#include "delegation.h"
40#include "nfs4filelayout.h" 41#include "nfs4filelayout.h"
@@ -178,7 +179,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
178 break; 179 break;
179 case -NFS4ERR_DELAY: 180 case -NFS4ERR_DELAY:
180 case -NFS4ERR_GRACE: 181 case -NFS4ERR_GRACE:
181 case -EKEYEXPIRED:
182 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); 182 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
183 break; 183 break;
184 case -NFS4ERR_RETRY_UNCACHED_REP: 184 case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -306,12 +306,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
     }
     rdata->read_done_cb = filelayout_read_done_cb;
 
-    if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
-            &rdata->args.seq_args, &rdata->res.seq_res,
-            task))
-        return;
-
-    rpc_call_start(task);
+    nfs41_setup_sequence(rdata->ds_clp->cl_session,
+            &rdata->args.seq_args,
+            &rdata->res.seq_res,
+            task);
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -408,12 +406,10 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
         rpc_exit(task, 0);
         return;
     }
-    if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-            &wdata->args.seq_args, &wdata->res.seq_res,
-            task))
-        return;
-
-    rpc_call_start(task);
+    nfs41_setup_sequence(wdata->ds_clp->cl_session,
+            &wdata->args.seq_args,
+            &wdata->res.seq_res,
+            task);
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -449,12 +445,10 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
 {
     struct nfs_commit_data *wdata = data;
 
-    if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
-            &wdata->args.seq_args, &wdata->res.seq_res,
-            task))
-        return;
-
-    rpc_call_start(task);
+    nfs41_setup_sequence(wdata->ds_clp->cl_session,
+            &wdata->args.seq_args,
+            &wdata->res.seq_res,
+            task);
 }
 
 static void filelayout_write_commit_done(struct rpc_task *task, void *data)
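All three pNFS prepare callbacks collapse to a bare nfs41_setup_sequence() call. That only works because, as the nfs4proc.c hunk for nfs41_setup_sequence() below shows, the helper now invokes rpc_call_start() itself once a slot is assigned and puts the task to sleep otherwise. The old and new calling conventions, sketched:

    /* old: every ->rpc_call_prepare had to kick the state machine */
    if (nfs41_setup_sequence(session, args, res, task))
        return;         /* task was queued on the slot-table waitq */
    rpc_call_start(task);

    /* new: the helper owns that decision */
    nfs41_setup_sequence(session, args, res, task);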
@@ -512,7 +506,6 @@ filelayout_read_pagelist(struct nfs_read_data *data)
     loff_t offset = data->args.offset;
     u32 j, idx;
     struct nfs_fh *fh;
-    int status;
 
     dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
         __func__, hdr->inode->i_ino,
@@ -538,9 +531,8 @@ filelayout_read_pagelist(struct nfs_read_data *data)
     data->mds_offset = offset;
 
     /* Perform an asynchronous read to ds */
-    status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
-            &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
-    BUG_ON(status != 0);
+    nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
+            &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
     return PNFS_ATTEMPTED;
 }
 
@@ -554,7 +546,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
     loff_t offset = data->args.offset;
     u32 j, idx;
     struct nfs_fh *fh;
-    int status;
 
     /* Retrieve the correct rpc_client for the byte range */
     j = nfs4_fl_calc_j_index(lseg, offset);
@@ -579,10 +570,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
     data->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
     /* Perform an asynchronous write */
-    status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
-            &filelayout_write_call_ops, sync,
-            RPC_TASK_SOFTCONN);
-    BUG_ON(status != 0);
+    nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
+            &filelayout_write_call_ops, sync,
+            RPC_TASK_SOFTCONN);
     return PNFS_ATTEMPTED;
 }
 
@@ -909,7 +899,7 @@ static void
 filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
             struct nfs_page *req)
 {
-    BUG_ON(pgio->pg_lseg != NULL);
+    WARN_ON_ONCE(pgio->pg_lseg != NULL);
 
     if (req->wb_offset != req->wb_pgbase) {
         /*
@@ -939,7 +929,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
     struct nfs_commit_info cinfo;
     int status;
 
-    BUG_ON(pgio->pg_lseg != NULL);
+    WARN_ON_ONCE(pgio->pg_lseg != NULL);
 
     if (req->wb_offset != req->wb_pgbase)
         goto out_mds;
@@ -1187,7 +1177,6 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
      */
     for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
         if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
-            BUG_ON(!list_empty(&b->written));
             pnfs_put_lseg(b->wlseg);
             b->wlseg = NULL;
         }
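Two related relaxations in the pagelist paths: the pgio->pg_lseg assertions become WARN_ON_ONCE(), which taints the log once instead of halting the machine, and the return values of nfs_initiate_read()/nfs_initiate_write() are no longer captured. Dropping BUG_ON(status != 0) suggests the initiate helpers now report failures through the rpc_task completion path rather than their return code; that reading is inferred from this hunk alone, since the helpers' definitions are outside this diff.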
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index a8eaa9b7bb0f..b720064bcd7f 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 
 #include "internal.h"
+#include "nfs4session.h"
 #include "nfs4filelayout.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
@@ -162,8 +163,6 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
     dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
         mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
 
-    BUG_ON(list_empty(&ds->ds_addrs));
-
     list_for_each_entry(da, &ds->ds_addrs, da_node) {
         dprintk("%s: DS %s: trying address %s\n",
             __func__, ds->ds_remotestr, da->da_remotestr);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5eec4429970c..493f0f41c554 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,7 +52,6 @@
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/nfs_idmap.h>
-#include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
 #include <linux/freezer.h>
@@ -64,14 +63,14 @@
 #include "callback.h"
 #include "pnfs.h"
 #include "netns.h"
+#include "nfs4session.h"
+
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
 #define NFS4_POLL_RETRY_MIN	(HZ/10)
 #define NFS4_POLL_RETRY_MAX	(15*HZ)
 
-#define NFS4_MAX_LOOP_ON_RECOVER (10)
-
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -206,7 +205,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 {
     __be32 *start, *p;
 
-    BUG_ON(readdir->count < 80);
     if (cookie > 2) {
         readdir->cookie = cookie;
         memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier));
@@ -256,22 +254,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
     kunmap_atomic(start);
 }
 
-static int nfs4_wait_clnt_recover(struct nfs_client *clp)
-{
-    int res;
-
-    might_sleep();
-
-    res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-            nfs_wait_bit_killable, TASK_KILLABLE);
-    if (res)
-        return res;
-
-    if (clp->cl_cons_state < 0)
-        return clp->cl_cons_state;
-    return 0;
-}
-
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
     int res = 0;
@@ -351,7 +333,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
     }
     case -NFS4ERR_GRACE:
     case -NFS4ERR_DELAY:
-    case -EKEYEXPIRED:
         ret = nfs4_delay(server->client, &exception->timeout);
         if (ret != 0)
             break;
@@ -397,144 +378,136 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 
 #if defined(CONFIG_NFS_V4_1)
 
-/*
- * nfs4_free_slot - free a slot and efficiently update slot table.
- *
- * freeing a slot is trivially done by clearing its respective bit
- * in the bitmap.
- * If the freed slotid equals highest_used_slotid we want to update it
- * so that the server would be able to size down the slot table if needed,
- * otherwise we know that the highest_used_slotid is still in use.
- * When updating highest_used_slotid there may be "holes" in the bitmap
- * so we need to scan down from highest_used_slotid to 0 looking for the now
- * highest slotid in use.
- * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
- *
- * Must be called while holding tbl->slot_tbl_lock
- */
-static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)
-{
-    BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);
-    /* clear used bit in bitmap */
-    __clear_bit(slotid, tbl->used_slots);
-
-    /* update highest_used_slotid when it is freed */
-    if (slotid == tbl->highest_used_slotid) {
-        slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
-        if (slotid < tbl->max_slots)
-            tbl->highest_used_slotid = slotid;
-        else
-            tbl->highest_used_slotid = NFS4_NO_SLOT;
-    }
-    dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
-        slotid, tbl->highest_used_slotid);
-}
-
-bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    return true;
-}
-
-/*
- * Signal state manager thread if session fore channel is drained
- */
-static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
-{
-    if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
-        rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq,
-                nfs4_set_task_privileged, NULL);
-        return;
-    }
-
-    if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
-        return;
-
-    dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
-    complete(&ses->fc_slot_table.complete);
-}
-
-/*
- * Signal state manager thread if session back channel is drained
- */
-void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
-{
-    if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
-        ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
-        return;
-    dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
-    complete(&ses->bc_slot_table.complete);
-}
-
 static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 {
+    struct nfs4_session *session;
     struct nfs4_slot_table *tbl;
+    bool send_new_highest_used_slotid = false;
 
-    tbl = &res->sr_session->fc_slot_table;
     if (!res->sr_slot) {
         /* just wake up the next guy waiting since
          * we may have not consumed a slot after all */
         dprintk("%s: No slot\n", __func__);
         return;
     }
+    tbl = res->sr_slot->table;
+    session = tbl->session;
 
     spin_lock(&tbl->slot_tbl_lock);
-    nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
-    nfs4_check_drain_fc_complete(res->sr_session);
+    /* Be nice to the server: try to ensure that the last transmitted
+     * value for highest_user_slotid <= target_highest_slotid
+     */
+    if (tbl->highest_used_slotid > tbl->target_highest_slotid)
+        send_new_highest_used_slotid = true;
+
+    if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) {
+        send_new_highest_used_slotid = false;
+        goto out_unlock;
+    }
+    nfs4_free_slot(tbl, res->sr_slot);
+
+    if (tbl->highest_used_slotid != NFS4_NO_SLOT)
+        send_new_highest_used_slotid = false;
+out_unlock:
     spin_unlock(&tbl->slot_tbl_lock);
     res->sr_slot = NULL;
+    if (send_new_highest_used_slotid)
+        nfs41_server_notify_highest_slotid_update(session->clp);
 }
 
 static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
-    unsigned long timestamp;
+    struct nfs4_session *session;
+    struct nfs4_slot *slot;
     struct nfs_client *clp;
-
-    /*
-     * sr_status remains 1 if an RPC level error occurred. The server
-     * may or may not have processed the sequence operation..
-     * Proceed as if the server received and processed the sequence
-     * operation.
-     */
-    if (res->sr_status == 1)
-        res->sr_status = NFS_OK;
+    bool interrupted = false;
+    int ret = 1;
 
     /* don't increment the sequence number if the task wasn't sent */
     if (!RPC_WAS_SENT(task))
         goto out;
 
+    slot = res->sr_slot;
+    session = slot->table->session;
+
+    if (slot->interrupted) {
+        slot->interrupted = 0;
+        interrupted = true;
+    }
+
     /* Check the SEQUENCE operation status */
     switch (res->sr_status) {
     case 0:
         /* Update the slot's sequence and clientid lease timer */
-        ++res->sr_slot->seq_nr;
-        timestamp = res->sr_renewal_time;
-        clp = res->sr_session->clp;
-        do_renew_lease(clp, timestamp);
+        ++slot->seq_nr;
+        clp = session->clp;
+        do_renew_lease(clp, res->sr_timestamp);
         /* Check sequence flags */
         if (res->sr_status_flags != 0)
             nfs4_schedule_lease_recovery(clp);
+        nfs41_update_target_slotid(slot->table, slot, res);
         break;
+    case 1:
+        /*
+         * sr_status remains 1 if an RPC level error occurred.
+         * The server may or may not have processed the sequence
+         * operation..
+         * Mark the slot as having hosted an interrupted RPC call.
+         */
+        slot->interrupted = 1;
+        goto out;
     case -NFS4ERR_DELAY:
         /* The server detected a resend of the RPC call and
          * returned NFS4ERR_DELAY as per Section 2.10.6.2
          * of RFC5661.
          */
-        dprintk("%s: slot=%td seq=%d: Operation in progress\n",
+        dprintk("%s: slot=%u seq=%u: Operation in progress\n",
             __func__,
-            res->sr_slot - res->sr_session->fc_slot_table.slots,
-            res->sr_slot->seq_nr);
+            slot->slot_nr,
+            slot->seq_nr);
         goto out_retry;
+    case -NFS4ERR_BADSLOT:
+        /*
+         * The slot id we used was probably retired. Try again
+         * using a different slot id.
+         */
+        goto retry_nowait;
+    case -NFS4ERR_SEQ_MISORDERED:
+        /*
+         * Was the last operation on this sequence interrupted?
+         * If so, retry after bumping the sequence number.
+         */
+        if (interrupted) {
+            ++slot->seq_nr;
+            goto retry_nowait;
+        }
+        /*
+         * Could this slot have been previously retired?
+         * If so, then the server may be expecting seq_nr = 1!
+         */
+        if (slot->seq_nr != 1) {
+            slot->seq_nr = 1;
+            goto retry_nowait;
+        }
+        break;
+    case -NFS4ERR_SEQ_FALSE_RETRY:
+        ++slot->seq_nr;
+        goto retry_nowait;
     default:
         /* Just update the slot sequence no. */
-        ++res->sr_slot->seq_nr;
+        ++slot->seq_nr;
     }
 out:
     /* The session may be reset by one of the error handlers. */
     dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
     nfs41_sequence_free_slot(res);
-    return 1;
+    return ret;
+retry_nowait:
+    if (rpc_restart_call_prepare(task)) {
+        task->tk_status = 0;
+        ret = 0;
+    }
+    goto out;
 out_retry:
     if (!rpc_restart_call(task))
         goto out;
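The reworked nfs41_sequence_done() gives each slot a memory: when an RPC fails at the transport level (sr_status still 1), the server's view of the slot is unknown, so the slot is flagged and the next SEQUENCE on it can disambiguate an -NFS4ERR_SEQ_MISORDERED reply. The decision ladder, condensed into a sketch (simplified from the hunk above, not a drop-in replacement):

    /* per-slot retry ladder, simplified */
    switch (res->sr_status) {
    case 1:                        /* RPC-level error, outcome unknown */
        slot->interrupted = 1;     /* remember for the slot's next user */
        break;
    case -NFS4ERR_BADSLOT:         /* slot was probably retired */
        goto retry_nowait;         /* reallocate a slot and resend */
    case -NFS4ERR_SEQ_MISORDERED:
        if (interrupted) {         /* finish the aborted seq_nr bump */
            ++slot->seq_nr;
            goto retry_nowait;
        }
        if (slot->seq_nr != 1) {   /* recycled slot may expect seq 1 */
            slot->seq_nr = 1;
            goto retry_nowait;
        }
        break;
    }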
@@ -545,55 +518,27 @@ out_retry:
 static int nfs4_sequence_done(struct rpc_task *task,
             struct nfs4_sequence_res *res)
 {
-    if (res->sr_session == NULL)
+    if (res->sr_slot == NULL)
         return 1;
     return nfs41_sequence_done(task, res);
 }
 
-/*
- * nfs4_find_slot - efficiently look for a free slot
- *
- * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
- * If found, we mark the slot as used, update the highest_used_slotid,
- * and respectively set up the sequence operation args.
- * The slot number is returned if found, or NFS4_NO_SLOT otherwise.
- *
- * Note: must be called with under the slot_tbl_lock.
- */
-static u32
-nfs4_find_slot(struct nfs4_slot_table *tbl)
-{
-    u32 slotid;
-    u32 ret_id = NFS4_NO_SLOT;
-
-    dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
-        __func__, tbl->used_slots[0], tbl->highest_used_slotid,
-        tbl->max_slots);
-    slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
-    if (slotid >= tbl->max_slots)
-        goto out;
-    __set_bit(slotid, tbl->used_slots);
-    if (slotid > tbl->highest_used_slotid ||
-            tbl->highest_used_slotid == NFS4_NO_SLOT)
-        tbl->highest_used_slotid = slotid;
-    ret_id = slotid;
-out:
-    dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n",
-        __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
-    return ret_id;
-}
-
 static void nfs41_init_sequence(struct nfs4_sequence_args *args,
         struct nfs4_sequence_res *res, int cache_reply)
 {
-    args->sa_session = NULL;
+    args->sa_slot = NULL;
     args->sa_cache_this = 0;
+    args->sa_privileged = 0;
     if (cache_reply)
         args->sa_cache_this = 1;
-    res->sr_session = NULL;
     res->sr_slot = NULL;
 }
 
+static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
+{
+    args->sa_privileged = 1;
+}
+
 int nfs41_setup_sequence(struct nfs4_session *session,
             struct nfs4_sequence_args *args,
             struct nfs4_sequence_res *res,
@@ -601,59 +546,59 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 {
     struct nfs4_slot *slot;
     struct nfs4_slot_table *tbl;
-    u32 slotid;
 
     dprintk("--> %s\n", __func__);
     /* slot already allocated? */
     if (res->sr_slot != NULL)
-        return 0;
+        goto out_success;
 
     tbl = &session->fc_slot_table;
 
+    task->tk_timeout = 0;
+
     spin_lock(&tbl->slot_tbl_lock);
     if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
-        !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+        !args->sa_privileged) {
         /* The state manager will wait until the slot table is empty */
-        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-        spin_unlock(&tbl->slot_tbl_lock);
         dprintk("%s session is draining\n", __func__);
-        return -EAGAIN;
+        goto out_sleep;
     }
 
-    if (!rpc_queue_empty(&tbl->slot_tbl_waitq) &&
-        !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
-        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-        spin_unlock(&tbl->slot_tbl_lock);
-        dprintk("%s enforce FIFO order\n", __func__);
-        return -EAGAIN;
-    }
-
-    slotid = nfs4_find_slot(tbl);
-    if (slotid == NFS4_NO_SLOT) {
-        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
-        spin_unlock(&tbl->slot_tbl_lock);
+    slot = nfs4_alloc_slot(tbl);
+    if (IS_ERR(slot)) {
+        /* If out of memory, try again in 1/4 second */
+        if (slot == ERR_PTR(-ENOMEM))
+            task->tk_timeout = HZ >> 2;
         dprintk("<-- %s: no free slots\n", __func__);
-        return -EAGAIN;
+        goto out_sleep;
     }
     spin_unlock(&tbl->slot_tbl_lock);
 
-    rpc_task_set_priority(task, RPC_PRIORITY_NORMAL);
-    slot = tbl->slots + slotid;
-    args->sa_session = session;
-    args->sa_slotid = slotid;
+    args->sa_slot = slot;
 
-    dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
+    dprintk("<-- %s slotid=%d seqid=%d\n", __func__,
+            slot->slot_nr, slot->seq_nr);
 
-    res->sr_session = session;
     res->sr_slot = slot;
-    res->sr_renewal_time = jiffies;
+    res->sr_timestamp = jiffies;
     res->sr_status_flags = 0;
     /*
     * sr_status is only set in decode_sequence, and so will remain
     * set to 1 if an rpc level failure occurs.
     */
     res->sr_status = 1;
+out_success:
+    rpc_call_start(task);
     return 0;
+out_sleep:
+    /* Privileged tasks are queued with top priority */
+    if (args->sa_privileged)
+        rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+                NULL, RPC_PRIORITY_PRIVILEGED);
+    else
+        rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+    spin_unlock(&tbl->slot_tbl_lock);
+    return -EAGAIN;
 }
 EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
 
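Slot allocation is now delegated to nfs4_alloc_slot(), declared elsewhere (presumably in the new nfs4session.h), and it can fail with ERR_PTR(-ENOMEM) since the table grows dynamically. The retry delay is worth decoding:

    /* a quarter of a second in scheduler ticks, HZ-independent:
     * HZ >> 2 == HZ / 4 ticks == 0.25 s whatever HZ is configured */
    task->tk_timeout = HZ >> 2;

so allocation pressure yields a delayed retry rather than a hard failure. The old code also enforced FIFO fairness by hand with rpc_queue_empty(); the new out_sleep path gets the equivalent effect from the priority waitqueue, where ordinary tasks queue normally and privileged ones (sa_privileged, used by state recovery) are queued at RPC_PRIORITY_PRIVILEGED so they can still run while the session drains.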
@@ -665,12 +610,14 @@ int nfs4_setup_sequence(const struct nfs_server *server,
     struct nfs4_session *session = nfs4_get_session(server);
     int ret = 0;
 
-    if (session == NULL)
+    if (session == NULL) {
+        rpc_call_start(task);
         goto out;
+    }
 
-    dprintk("--> %s clp %p session %p sr_slot %td\n",
+    dprintk("--> %s clp %p session %p sr_slot %d\n",
         __func__, session->clp, session, res->sr_slot ?
-        res->sr_slot - session->fc_slot_table.slots : -1);
+        res->sr_slot->slot_nr : -1);
 
     ret = nfs41_setup_sequence(session, args, res, task);
 out:
@@ -687,19 +634,11 @@ struct nfs41_call_sync_data {
 static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 {
     struct nfs41_call_sync_data *data = calldata;
+    struct nfs4_session *session = nfs4_get_session(data->seq_server);
 
     dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
 
-    if (nfs4_setup_sequence(data->seq_server, data->seq_args,
-            data->seq_res, task))
-        return;
-    rpc_call_start(task);
-}
-
-static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    nfs41_call_sync_prepare(task, calldata);
+    nfs41_setup_sequence(session, data->seq_args, data->seq_res, task);
 }
 
 static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
@@ -714,17 +653,11 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
     .rpc_call_done = nfs41_call_sync_done,
 };
 
-static const struct rpc_call_ops nfs41_call_priv_sync_ops = {
-    .rpc_call_prepare = nfs41_call_priv_sync_prepare,
-    .rpc_call_done = nfs41_call_sync_done,
-};
-
 static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
                 struct nfs_server *server,
                 struct rpc_message *msg,
                 struct nfs4_sequence_args *args,
-                struct nfs4_sequence_res *res,
-                int privileged)
+                struct nfs4_sequence_res *res)
 {
     int ret;
     struct rpc_task *task;
@@ -740,8 +673,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
         .callback_data = &data
     };
 
-    if (privileged)
-        task_setup.callback_ops = &nfs41_call_priv_sync_ops;
     task = rpc_run_task(&task_setup);
     if (IS_ERR(task))
         ret = PTR_ERR(task);
@@ -752,24 +683,18 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
     return ret;
 }
 
-int _nfs4_call_sync_session(struct rpc_clnt *clnt,
-            struct nfs_server *server,
-            struct rpc_message *msg,
-            struct nfs4_sequence_args *args,
-            struct nfs4_sequence_res *res,
-            int cache_reply)
-{
-    nfs41_init_sequence(args, res, cache_reply);
-    return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);
-}
-
 #else
-static inline
+static
 void nfs41_init_sequence(struct nfs4_sequence_args *args,
         struct nfs4_sequence_res *res, int cache_reply)
 {
 }
 
+static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
+{
+}
+
+
 static int nfs4_sequence_done(struct rpc_task *task,
             struct nfs4_sequence_res *res)
 {
@@ -777,18 +702,17 @@ static int nfs4_sequence_done(struct rpc_task *task,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
+static
 int _nfs4_call_sync(struct rpc_clnt *clnt,
         struct nfs_server *server,
         struct rpc_message *msg,
         struct nfs4_sequence_args *args,
-        struct nfs4_sequence_res *res,
-        int cache_reply)
+        struct nfs4_sequence_res *res)
 {
-    nfs41_init_sequence(args, res, cache_reply);
     return rpc_call_sync(clnt, msg, 0);
 }
 
-static inline
+static
 int nfs4_call_sync(struct rpc_clnt *clnt,
         struct nfs_server *server,
         struct rpc_message *msg,
@@ -796,8 +720,9 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
         struct nfs4_sequence_res *res,
         int cache_reply)
 {
+    nfs41_init_sequence(args, res, cache_reply);
     return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
-            args, res, cache_reply);
+            args, res);
 }
 
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
@@ -1445,13 +1370,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
             nfs_inode_find_state_and_recover(state->inode,
                     stateid);
             nfs4_schedule_stateid_recovery(server, state);
-        case -EKEYEXPIRED:
-            /*
-             * User RPCSEC_GSS context has expired.
-             * We cannot recover this stateid now, so
-             * skip it and allow recovery thread to
-             * proceed.
-             */
         case -ENOMEM:
             err = 0;
             goto out;
@@ -1574,20 +1492,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
                 &data->o_res.seq_res,
                 task) != 0)
         nfs_release_seqid(data->o_arg.seqid);
-    else
-        rpc_call_start(task);
     return;
 unlock_no_action:
     rcu_read_unlock();
 out_no_action:
     task->tk_action = NULL;
-
-}
-
-static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    nfs4_open_prepare(task, calldata);
+    nfs4_sequence_done(task, &data->o_res.seq_res);
 }
 
 static void nfs4_open_done(struct rpc_task *task, void *calldata)
@@ -1648,12 +1558,6 @@ static const struct rpc_call_ops nfs4_open_ops = {
     .rpc_release = nfs4_open_release,
 };
 
-static const struct rpc_call_ops nfs4_recover_open_ops = {
-    .rpc_call_prepare = nfs4_recover_open_prepare,
-    .rpc_call_done = nfs4_open_done,
-    .rpc_release = nfs4_open_release,
-};
-
 static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 {
     struct inode *dir = data->dir->d_inode;
@@ -1683,7 +1587,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
     data->rpc_status = 0;
     data->cancelled = 0;
     if (isrecover)
-        task_setup_data.callback_ops = &nfs4_recover_open_ops;
+        nfs4_set_sequence_privileged(&o_arg->seq_args);
     task = rpc_run_task(&task_setup_data);
     if (IS_ERR(task))
         return PTR_ERR(task);
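This is the payoff of the sa_privileged flag: the open, lock, and SEQUENCE paths previously needed parallel "recover" rpc_call_ops vectors whose only job was to raise the task priority before delegating to the normal prepare function. Marking the sequence arguments instead removes the duplicate callback tables and trampoline functions. The two idioms side by side, taken from the hunks above:

    /* old: a duplicate ops vector per recoverable operation */
    static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
    {
        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
        nfs4_open_prepare(task, calldata);
    }

    /* new: one flag on the SEQUENCE arguments, honoured centrally
     * by nfs41_setup_sequence() */
    if (isrecover)
        nfs4_set_sequence_privileged(&o_arg->seq_args);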
@@ -1789,24 +1693,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
     return 0;
 }
 
-static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
-{
-    unsigned int loop;
-    int ret;
-
-    for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
-        ret = nfs4_wait_clnt_recover(clp);
-        if (ret != 0)
-            break;
-        if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
-            !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
-            break;
-        nfs4_schedule_state_manager(clp);
-        ret = -EIO;
-    }
-    return ret;
-}
-
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
     return nfs4_client_recover_expired_lease(server->nfs_client);
@@ -2282,6 +2168,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
     if (!call_close) {
         /* Note: exit _without_ calling nfs4_close_done */
         task->tk_action = NULL;
+        nfs4_sequence_done(task, &calldata->res.seq_res);
         goto out;
     }
 
@@ -2299,8 +2186,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
             &calldata->res.seq_res,
             task) != 0)
         nfs_release_seqid(calldata->arg.seqid);
-    else
-        rpc_call_start(task);
 out:
     dprintk("%s: done!\n", __func__);
 }
@@ -2533,7 +2418,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
     rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS];
 
     len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array));
-    BUG_ON(len < 0);
+    if (len < 0)
+        return len;
 
     for (i = 0; i < len; i++) {
         /* AUTH_UNIX is the default flavor if none was specified,
@@ -3038,12 +2924,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->dir),
+    nfs4_setup_sequence(NFS_SERVER(data->dir),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
@@ -3071,12 +2955,10 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 
 static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->old_dir),
+    nfs4_setup_sequence(NFS_SERVER(data->old_dir),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3362,9 +3244,6 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
     int mode = sattr->ia_mode;
     int status = -ENOMEM;
 
-    BUG_ON(!(sattr->ia_valid & ATTR_MODE));
-    BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
-
     data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
     if (data == NULL)
         goto out;
@@ -3380,10 +3259,13 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
         data->arg.ftype = NF4CHR;
         data->arg.u.device.specdata1 = MAJOR(rdev);
         data->arg.u.device.specdata2 = MINOR(rdev);
+    } else if (!S_ISSOCK(mode)) {
+        status = -EINVAL;
+        goto out_free;
     }
 
     status = nfs4_do_create(dir, dentry, data);
-
+out_free:
     nfs4_free_createdata(data);
 out:
     return status;
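The mknod change trades a panic for -EINVAL, and the ordering matters: the file-type check now happens after nfs4_alloc_createdata(), so the error path has to release the createdata, hence the new out_free label. Note also why S_ISSOCK needs no branch of its own: the default ftype passed to the allocator above is NF4SOCK, so only FIFO and the two device types override it, and anything else is rejected.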
@@ -3565,12 +3447,10 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 
 static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+    nfs4_setup_sequence(NFS_SERVER(data->header->inode),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3631,22 +3511,18 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 
 static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+    nfs4_setup_sequence(NFS_SERVER(data->header->inode),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
 {
-    if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+    nfs4_setup_sequence(NFS_SERVER(data->inode),
             &data->args.seq_args,
             &data->res.seq_res,
-            task))
-        return;
-    rpc_call_start(task);
+            task);
 }
 
 static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
@@ -3937,8 +3813,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
         goto out_free;
     }
     nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
-    if (buf)
+    if (buf) {
+        if (res.acl_len > buflen) {
+            ret = -ERANGE;
+            goto out_free;
+        }
         _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+    }
 out_ok:
     ret = res.acl_len;
 out_free:
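__nfs4_get_acl_uncached() now follows the getxattr(2)-style contract: a NULL buf still returns the ACL length so callers can size a buffer, while a non-NULL but short buffer fails with -ERANGE before any copy, instead of copying first and letting the caller discover truncation. The calling pattern this enables, sketched from a hypothetical caller (not taken from this diff):

    /* two-pass read: probe for size, then fetch */
    ssize_t len = __nfs4_get_acl_uncached(inode, NULL, 0);
    if (len > 0) {
        void *acl = kmalloc(len, GFP_KERNEL);
        if (acl) {
            len = __nfs4_get_acl_uncached(inode, acl, len);
            /* -ERANGE here would mean the ACL grew in between */
        }
    }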
@@ -4085,7 +3966,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
         case -NFS4ERR_DELAY:
             nfs_inc_server_stats(server, NFSIOS_DELAY);
         case -NFS4ERR_GRACE:
-        case -EKEYEXPIRED:
             rpc_delay(task, NFS4_POLL_RETRY_MAX);
             task->tk_status = 0;
             return -EAGAIN;
@@ -4293,11 +4173,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
     d_data = (struct nfs4_delegreturndata *)data;
 
-    if (nfs4_setup_sequence(d_data->res.server,
-            &d_data->args.seq_args,
-            &d_data->res.seq_res, task))
-        return;
-    rpc_call_start(task);
+    nfs4_setup_sequence(d_data->res.server,
+            &d_data->args.seq_args,
+            &d_data->res.seq_res,
+            task);
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -4543,6 +4422,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
     if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
         /* Note: exit _without_ running nfs4_locku_done */
         task->tk_action = NULL;
+        nfs4_sequence_done(task, &calldata->res.seq_res);
         return;
     }
     calldata->timestamp = jiffies;
@@ -4551,8 +4431,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
             &calldata->res.seq_res,
             task) != 0)
         nfs_release_seqid(calldata->arg.seqid);
-    else
-        rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4696,8 +4574,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
         return;
     /* Do we need to do an open_to_lock_owner? */
     if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
-        if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
+        if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
             goto out_release_lock_seqid;
+        }
         data->arg.open_stateid = &state->stateid;
         data->arg.new_lock_owner = 1;
         data->res.open_seqid = data->arg.open_seqid;
@@ -4707,20 +4586,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
     if (nfs4_setup_sequence(data->server,
                 &data->arg.seq_args,
                 &data->res.seq_res,
-                task) == 0) {
-        rpc_call_start(task);
+                task) == 0)
         return;
-    }
     nfs_release_seqid(data->arg.open_seqid);
 out_release_lock_seqid:
     nfs_release_seqid(data->arg.lock_seqid);
-    dprintk("%s: done!, ret = %d\n", __func__, task->tk_status);
-}
-
-static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
-{
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-    nfs4_lock_prepare(task, calldata);
+    dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
 
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
@@ -4775,12 +4646,6 @@ static const struct rpc_call_ops nfs4_lock_ops = {
     .rpc_release = nfs4_lock_release,
 };
 
-static const struct rpc_call_ops nfs4_recover_lock_ops = {
-    .rpc_call_prepare = nfs4_recover_lock_prepare,
-    .rpc_call_done = nfs4_lock_done,
-    .rpc_release = nfs4_lock_release,
-};
-
 static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
 {
     switch (error) {
@@ -4823,15 +4688,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
         return -ENOMEM;
     if (IS_SETLKW(cmd))
         data->arg.block = 1;
-    if (recovery_type > NFS_LOCK_NEW) {
-        if (recovery_type == NFS_LOCK_RECLAIM)
-            data->arg.reclaim = NFS_LOCK_RECLAIM;
-        task_setup_data.callback_ops = &nfs4_recover_lock_ops;
-    }
     nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
     msg.rpc_argp = &data->arg;
     msg.rpc_resp = &data->res;
     task_setup_data.callback_data = data;
+    if (recovery_type > NFS_LOCK_NEW) {
+        if (recovery_type == NFS_LOCK_RECLAIM)
+            data->arg.reclaim = NFS_LOCK_RECLAIM;
+        nfs4_set_sequence_privileged(&data->arg.seq_args);
+    }
     task = rpc_run_task(&task_setup_data);
     if (IS_ERR(task))
         return PTR_ERR(task);
@@ -5100,15 +4965,6 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
         nfs4_schedule_stateid_recovery(server, state);
         err = 0;
         goto out;
-    case -EKEYEXPIRED:
-        /*
-         * User RPCSEC_GSS context has expired.
-         * We cannot recover this stateid now, so
-         * skip it and allow recovery thread to
-         * proceed.
-         */
-        err = 0;
-        goto out;
     case -ENOMEM:
     case -NFS4ERR_DENIED:
         /* kill_proc(fl->fl_pid, SIGLOST, 1); */
@@ -5357,7 +5213,6 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
     };
 
     dprintk("--> %s\n", __func__);
-    BUG_ON(clp == NULL);
 
     res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
     if (unlikely(res.session == NULL)) {
@@ -5569,20 +5424,16 @@ struct nfs4_get_lease_time_data {
 static void nfs4_get_lease_time_prepare(struct rpc_task *task,
                     void *calldata)
 {
-    int ret;
     struct nfs4_get_lease_time_data *data =
             (struct nfs4_get_lease_time_data *)calldata;
 
     dprintk("--> %s\n", __func__);
-    rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
     /* just setup sequence, do not trigger session recovery
        since we're invoked within one */
-    ret = nfs41_setup_sequence(data->clp->cl_session,
-            &data->args->la_seq_args,
-            &data->res->lr_seq_res, task);
-
-    BUG_ON(ret == -EAGAIN);
-    rpc_call_start(task);
+    nfs41_setup_sequence(data->clp->cl_session,
+            &data->args->la_seq_args,
+            &data->res->lr_seq_res,
+            task);
     dprintk("<-- %s\n", __func__);
 }
 
@@ -5644,6 +5495,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
     int status;
 
     nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
+    nfs4_set_sequence_privileged(&args.la_seq_args);
     dprintk("--> %s\n", __func__);
     task = rpc_run_task(&task_setup);
 
@@ -5658,145 +5510,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
     return status;
 }
 
-static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
-{
-    return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
-}
-
-static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
-        struct nfs4_slot *new,
-        u32 max_slots,
-        u32 ivalue)
-{
-    struct nfs4_slot *old = NULL;
-    u32 i;
-
-    spin_lock(&tbl->slot_tbl_lock);
-    if (new) {
-        old = tbl->slots;
-        tbl->slots = new;
-        tbl->max_slots = max_slots;
-    }
-    tbl->highest_used_slotid = NFS4_NO_SLOT;
-    for (i = 0; i < tbl->max_slots; i++)
-        tbl->slots[i].seq_nr = ivalue;
-    spin_unlock(&tbl->slot_tbl_lock);
-    kfree(old);
-}
-
-/*
- * (re)Initialise a slot table
- */
-static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
-        u32 ivalue)
-{
-    struct nfs4_slot *new = NULL;
-    int ret = -ENOMEM;
-
-    dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
-        max_reqs, tbl->max_slots);
-
-    /* Does the newly negotiated max_reqs match the existing slot table? */
-    if (max_reqs != tbl->max_slots) {
-        new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
-        if (!new)
-            goto out;
-    }
-    ret = 0;
-
-    nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
-    dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
-        tbl, tbl->slots, tbl->max_slots);
-out:
-    dprintk("<-- %s: return %d\n", __func__, ret);
-    return ret;
-}
-
-/* Destroy the slot table */
-static void nfs4_destroy_slot_tables(struct nfs4_session *session)
-{
-    if (session->fc_slot_table.slots != NULL) {
-        kfree(session->fc_slot_table.slots);
-        session->fc_slot_table.slots = NULL;
-    }
-    if (session->bc_slot_table.slots != NULL) {
-        kfree(session->bc_slot_table.slots);
-        session->bc_slot_table.slots = NULL;
-    }
-    return;
-}
-
-/*
- * Initialize or reset the forechannel and backchannel tables
- */
-static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
-{
-    struct nfs4_slot_table *tbl;
-    int status;
-
-    dprintk("--> %s\n", __func__);
-    /* Fore channel */
-    tbl = &ses->fc_slot_table;
-    status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-    if (status) /* -ENOMEM */
-        return status;
-    /* Back channel */
-    tbl = &ses->bc_slot_table;
-    status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
-    if (status && tbl->slots == NULL)
-        /* Fore and back channel share a connection so get
-         * both slot tables or neither */
-        nfs4_destroy_slot_tables(ses);
-    return status;
-}
-
-struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
-{
-    struct nfs4_session *session;
-    struct nfs4_slot_table *tbl;
-
-    session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
-    if (!session)
-        return NULL;
-
-    tbl = &session->fc_slot_table;
-    tbl->highest_used_slotid = NFS4_NO_SLOT;
-    spin_lock_init(&tbl->slot_tbl_lock);
-    rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
-    init_completion(&tbl->complete);
-
-    tbl = &session->bc_slot_table;
-    tbl->highest_used_slotid = NFS4_NO_SLOT;
-    spin_lock_init(&tbl->slot_tbl_lock);
-    rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
-    init_completion(&tbl->complete);
-
-    session->session_state = 1<<NFS4_SESSION_INITING;
-
-    session->clp = clp;
-    return session;
-}
-
-void nfs4_destroy_session(struct nfs4_session *session)
-{
-    struct rpc_xprt *xprt;
-    struct rpc_cred *cred;
-
-    cred = nfs4_get_exchange_id_cred(session->clp);
-    nfs4_proc_destroy_session(session, cred);
-    if (cred)
-        put_rpccred(cred);
-
-    rcu_read_lock();
-    xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
-    rcu_read_unlock();
-    dprintk("%s Destroy backchannel for xprt %p\n",
-        __func__, xprt);
-    xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
-    nfs4_destroy_slot_tables(session);
-    kfree(session);
-}
-
 /*
  * Initialize the values to be used by the client in CREATE_SESSION
  * If nfs4_init_session set the fore channel request and response sizes,
@@ -5809,8 +5522,8 @@ void nfs4_destroy_session(struct nfs4_session *session)
 static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 {
     struct nfs4_session *session = args->client->cl_session;
-    unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz,
-             mxresp_sz = session->fc_attrs.max_resp_sz;
+    unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
+             mxresp_sz = session->fc_target_max_resp_sz;
 
     if (mxrqst_sz == 0)
         mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
@@ -5919,10 +5632,9 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 
     status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 
-    if (!status)
+    if (!status) {
         /* Verify the session's negotiated channel_attrs values */
         status = nfs4_verify_channel_attrs(&args, session);
-    if (!status) {
         /* Increment the clientid slot sequence id */
         clp->cl_seqid++;
     }
@@ -5992,83 +5704,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
 }
 
 /*
- * With sessions, the client is not marked ready until after a
- * successful EXCHANGE_ID and CREATE_SESSION.
- *
- * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
- * other versions of NFS can be tried.
- */
-static int nfs41_check_session_ready(struct nfs_client *clp)
-{
-    int ret;
-
-    if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
-        ret = nfs4_client_recover_expired_lease(clp);
-        if (ret)
-            return ret;
-    }
-    if (clp->cl_cons_state < NFS_CS_READY)
-        return -EPROTONOSUPPORT;
-    smp_rmb();
-    return 0;
-}
-
-int nfs4_init_session(struct nfs_server *server)
-{
-    struct nfs_client *clp = server->nfs_client;
-    struct nfs4_session *session;
-    unsigned int rsize, wsize;
-
-    if (!nfs4_has_session(clp))
-        return 0;
-
-    session = clp->cl_session;
-    spin_lock(&clp->cl_lock);
-    if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-
-        rsize = server->rsize;
-        if (rsize == 0)
-            rsize = NFS_MAX_FILE_IO_SIZE;
-        wsize = server->wsize;
-        if (wsize == 0)
-            wsize = NFS_MAX_FILE_IO_SIZE;
-
-        session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
-        session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
-    }
-    spin_unlock(&clp->cl_lock);
-
-    return nfs41_check_session_ready(clp);
-}
-
-int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
-{
-    struct nfs4_session *session = clp->cl_session;
-    int ret;
-
-    spin_lock(&clp->cl_lock);
-    if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-        /*
-         * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
-         * DS lease to be equal to the MDS lease.
-         */
-        clp->cl_lease_time = lease_time;
-        clp->cl_last_renewal = jiffies;
-    }
-    spin_unlock(&clp->cl_lock);
-
-    ret = nfs41_check_session_ready(clp);
-    if (ret)
-        return ret;
-    /* Test for the DS role */
-    if (!is_ds_client(clp))
-        return -ENODEV;
-    return 0;
-}
-EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
-
-
-/*
  * Renew the cl_session lease.
  */
 struct nfs4_sequence_data {
@@ -6133,9 +5768,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
6133 args = task->tk_msg.rpc_argp; 5768 args = task->tk_msg.rpc_argp;
6134 res = task->tk_msg.rpc_resp; 5769 res = task->tk_msg.rpc_resp;
6135 5770
6136 if (nfs41_setup_sequence(clp->cl_session, args, res, task)) 5771 nfs41_setup_sequence(clp->cl_session, args, res, task);
6137 return;
6138 rpc_call_start(task);
6139} 5772}
6140 5773
6141static const struct rpc_call_ops nfs41_sequence_ops = { 5774static const struct rpc_call_ops nfs41_sequence_ops = {
@@ -6144,7 +5777,9 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
6144 .rpc_release = nfs41_sequence_release, 5777 .rpc_release = nfs41_sequence_release,
6145}; 5778};
6146 5779
6147static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5780static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
5781 struct rpc_cred *cred,
5782 bool is_privileged)
6148{ 5783{
6149 struct nfs4_sequence_data *calldata; 5784 struct nfs4_sequence_data *calldata;
6150 struct rpc_message msg = { 5785 struct rpc_message msg = {
@@ -6166,6 +5801,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
6166 return ERR_PTR(-ENOMEM); 5801 return ERR_PTR(-ENOMEM);
6167 } 5802 }
6168 nfs41_init_sequence(&calldata->args, &calldata->res, 0); 5803 nfs41_init_sequence(&calldata->args, &calldata->res, 0);
5804 if (is_privileged)
5805 nfs4_set_sequence_privileged(&calldata->args);
6169 msg.rpc_argp = &calldata->args; 5806 msg.rpc_argp = &calldata->args;
6170 msg.rpc_resp = &calldata->res; 5807 msg.rpc_resp = &calldata->res;
6171 calldata->clp = clp; 5808 calldata->clp = clp;
@@ -6181,7 +5818,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
6181 5818
6182 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) 5819 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
6183 return 0; 5820 return 0;
6184 task = _nfs41_proc_sequence(clp, cred); 5821 task = _nfs41_proc_sequence(clp, cred, false);
6185 if (IS_ERR(task)) 5822 if (IS_ERR(task))
6186 ret = PTR_ERR(task); 5823 ret = PTR_ERR(task);
6187 else 5824 else
@@ -6195,7 +5832,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
6195 struct rpc_task *task; 5832 struct rpc_task *task;
6196 int ret; 5833 int ret;
6197 5834
6198 task = _nfs41_proc_sequence(clp, cred); 5835 task = _nfs41_proc_sequence(clp, cred, true);
6199 if (IS_ERR(task)) { 5836 if (IS_ERR(task)) {
6200 ret = PTR_ERR(task); 5837 ret = PTR_ERR(task);
6201 goto out; 5838 goto out;
@@ -6224,13 +5861,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
6224{ 5861{
6225 struct nfs4_reclaim_complete_data *calldata = data; 5862 struct nfs4_reclaim_complete_data *calldata = data;
6226 5863
6227 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 5864 nfs41_setup_sequence(calldata->clp->cl_session,
6228 if (nfs41_setup_sequence(calldata->clp->cl_session, 5865 &calldata->arg.seq_args,
6229 &calldata->arg.seq_args, 5866 &calldata->res.seq_res,
6230 &calldata->res.seq_res, task)) 5867 task);
6231 return;
6232
6233 rpc_call_start(task);
6234} 5868}
6235 5869
6236static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) 5870static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
@@ -6307,6 +5941,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
6307 calldata->arg.one_fs = 0; 5941 calldata->arg.one_fs = 0;
6308 5942
6309 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); 5943 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
5944 nfs4_set_sequence_privileged(&calldata->arg.seq_args);
6310 msg.rpc_argp = &calldata->arg; 5945 msg.rpc_argp = &calldata->arg;
6311 msg.rpc_resp = &calldata->res; 5946 msg.rpc_resp = &calldata->res;
6312 task_setup_data.callback_data = calldata; 5947 task_setup_data.callback_data = calldata;
@@ -6330,6 +5965,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6330{ 5965{
6331 struct nfs4_layoutget *lgp = calldata; 5966 struct nfs4_layoutget *lgp = calldata;
6332 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 5967 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5968 struct nfs4_session *session = nfs4_get_session(server);
6333 5969
6334 dprintk("--> %s\n", __func__); 5970 dprintk("--> %s\n", __func__);
6335 /* Note there is a race here, where a CB_LAYOUTRECALL can come in 5971
@@ -6337,16 +5973,14 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6337 * However, that is not so catastrophic, and there seems 5973 * However, that is not so catastrophic, and there seems
6338 * to be no way to prevent it completely. 5974 * to be no way to prevent it completely.
6339 */ 5975 */
6340 if (nfs4_setup_sequence(server, &lgp->args.seq_args, 5976 if (nfs41_setup_sequence(session, &lgp->args.seq_args,
6341 &lgp->res.seq_res, task)) 5977 &lgp->res.seq_res, task))
6342 return; 5978 return;
6343 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, 5979 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
6344 NFS_I(lgp->args.inode)->layout, 5980 NFS_I(lgp->args.inode)->layout,
6345 lgp->args.ctx->state)) { 5981 lgp->args.ctx->state)) {
6346 rpc_exit(task, NFS4_OK); 5982 rpc_exit(task, NFS4_OK);
6347 return;
6348 } 5983 }
6349 rpc_call_start(task);
6350} 5984}
6351 5985
6352static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) 5986static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -6359,7 +5993,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
6359 5993
6360 dprintk("--> %s\n", __func__); 5994 dprintk("--> %s\n", __func__);
6361 5995
6362 if (!nfs4_sequence_done(task, &lgp->res.seq_res)) 5996 if (!nfs41_sequence_done(task, &lgp->res.seq_res))
6363 goto out; 5997 goto out;
6364 5998
6365 switch (task->tk_status) { 5999 switch (task->tk_status) {
@@ -6510,10 +6144,10 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
6510 struct nfs4_layoutreturn *lrp = calldata; 6144 struct nfs4_layoutreturn *lrp = calldata;
6511 6145
6512 dprintk("--> %s\n", __func__); 6146 dprintk("--> %s\n", __func__);
6513 if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, 6147 nfs41_setup_sequence(lrp->clp->cl_session,
6514 &lrp->res.seq_res, task)) 6148 &lrp->args.seq_args,
6515 return; 6149 &lrp->res.seq_res,
6516 rpc_call_start(task); 6150 task);
6517} 6151}
6518 6152
6519static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) 6153static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
@@ -6523,7 +6157,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
6523 6157
6524 dprintk("--> %s\n", __func__); 6158 dprintk("--> %s\n", __func__);
6525 6159
6526 if (!nfs4_sequence_done(task, &lrp->res.seq_res)) 6160 if (!nfs41_sequence_done(task, &lrp->res.seq_res))
6527 return; 6161 return;
6528 6162
6529 server = NFS_SERVER(lrp->args.inode); 6163 server = NFS_SERVER(lrp->args.inode);
@@ -6672,11 +6306,12 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
6672{ 6306{
6673 struct nfs4_layoutcommit_data *data = calldata; 6307 struct nfs4_layoutcommit_data *data = calldata;
6674 struct nfs_server *server = NFS_SERVER(data->args.inode); 6308 struct nfs_server *server = NFS_SERVER(data->args.inode);
6309 struct nfs4_session *session = nfs4_get_session(server);
6675 6310
6676 if (nfs4_setup_sequence(server, &data->args.seq_args, 6311 nfs41_setup_sequence(session,
6677 &data->res.seq_res, task)) 6312 &data->args.seq_args,
6678 return; 6313 &data->res.seq_res,
6679 rpc_call_start(task); 6314 task);
6680} 6315}
6681 6316
6682static void 6317static void
@@ -6685,7 +6320,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
6685 struct nfs4_layoutcommit_data *data = calldata; 6320 struct nfs4_layoutcommit_data *data = calldata;
6686 struct nfs_server *server = NFS_SERVER(data->args.inode); 6321 struct nfs_server *server = NFS_SERVER(data->args.inode);
6687 6322
6688 if (!nfs4_sequence_done(task, &data->res.seq_res)) 6323 if (!nfs41_sequence_done(task, &data->res.seq_res))
6689 return; 6324 return;
6690 6325
6691 switch (task->tk_status) { /* Just ignore these failures */ 6326 switch (task->tk_status) { /* Just ignore these failures */
@@ -6873,7 +6508,9 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6873 6508
6874 dprintk("NFS call test_stateid %p\n", stateid); 6509 dprintk("NFS call test_stateid %p\n", stateid);
6875 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 6510 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
6876 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); 6511 nfs4_set_sequence_privileged(&args.seq_args);
6512 status = nfs4_call_sync_sequence(server->client, server, &msg,
6513 &args.seq_args, &res.seq_res);
6877 if (status != NFS_OK) { 6514 if (status != NFS_OK) {
6878 dprintk("NFS reply test_stateid: failed, %d\n", status); 6515 dprintk("NFS reply test_stateid: failed, %d\n", status);
6879 return status; 6516 return status;
@@ -6920,8 +6557,9 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6920 6557
6921 dprintk("NFS call free_stateid %p\n", stateid); 6558 dprintk("NFS call free_stateid %p\n", stateid);
6922 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 6559 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
6560 nfs4_set_sequence_privileged(&args.seq_args);
6923 status = nfs4_call_sync_sequence(server->client, server, &msg, 6561 status = nfs4_call_sync_sequence(server->client, server, &msg,
6924 &args.seq_args, &res.seq_res, 1); 6562 &args.seq_args, &res.seq_res);
6925 dprintk("NFS reply free_stateid: %d\n", status); 6563 dprintk("NFS reply free_stateid: %d\n", status);
6926 return status; 6564 return status;
6927} 6565}
@@ -7041,7 +6679,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
7041#if defined(CONFIG_NFS_V4_1) 6679#if defined(CONFIG_NFS_V4_1)
7042static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { 6680static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7043 .minor_version = 1, 6681 .minor_version = 1,
7044 .call_sync = _nfs4_call_sync_session, 6682 .call_sync = nfs4_call_sync_sequence,
7045 .match_stateid = nfs41_match_stateid, 6683 .match_stateid = nfs41_match_stateid,
7046 .find_root_sec = nfs41_find_root_sec, 6684 .find_root_sec = nfs41_find_root_sec,
7047 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 6685 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
new file mode 100644
index 000000000000..ebda5f4a031b
--- /dev/null
+++ b/fs/nfs/nfs4session.c
@@ -0,0 +1,552 @@
1/*
2 * fs/nfs/nfs4session.c
3 *
4 * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 */
7#include <linux/kernel.h>
8#include <linux/errno.h>
9#include <linux/string.h>
10#include <linux/printk.h>
11#include <linux/slab.h>
12#include <linux/sunrpc/sched.h>
13#include <linux/sunrpc/bc_xprt.h>
14#include <linux/nfs.h>
15#include <linux/nfs4.h>
16#include <linux/nfs_fs.h>
17#include <linux/module.h>
18
19#include "nfs4_fs.h"
20#include "internal.h"
21#include "nfs4session.h"
22#include "callback.h"
23
24#define NFSDBG_FACILITY NFSDBG_STATE
25
26/*
27 * nfs4_shrink_slot_table - free retired slots from the slot table
28 */
29static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
30{
31 struct nfs4_slot **p;
32 if (newsize >= tbl->max_slots)
33 return;
34
35 p = &tbl->slots;
36 while (newsize--)
37 p = &(*p)->next;
38 while (*p) {
39 struct nfs4_slot *slot = *p;
40
41 *p = slot->next;
42 kfree(slot);
43 tbl->max_slots--;
44 }
45}
46
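The helper above trims a singly linked list with a pointer-to-pointer walk: advance past the slots to keep, then unlink and free the rest. A minimal userspace sketch of the same idiom, with invented types rather than the kernel's:

	#include <stdlib.h>

	struct node { struct node *next; };

	/* Keep the first 'keep' nodes of the list at *head and free the
	 * rest -- the same **p walk used by nfs4_shrink_slot_table(). */
	static void truncate_list(struct node **head, unsigned int keep)
	{
		struct node **p = head;

		while (keep-- && *p)
			p = &(*p)->next;
		while (*p) {
			struct node *victim = *p;

			*p = victim->next;
			free(victim);
		}
	}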
47/*
48 * nfs4_free_slot - free a slot and efficiently update slot table.
49 *
50 * Freeing a slot is trivially done by clearing its respective bit
51 * in the bitmap.
52 * If the freed slotid equals highest_used_slotid we want to update it
53 * so that the server can size down the slot table if needed;
54 * otherwise we know that the highest_used_slotid is still in use.
55 * When updating highest_used_slotid there may be "holes" in the bitmap,
56 * so we need to scan down from highest_used_slotid to 0 looking for the new
57 * highest slotid in use.
58 * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
59 *
60 * Must be called while holding tbl->slot_tbl_lock
61 */
62void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
63{
64 u32 slotid = slot->slot_nr;
65
66 /* clear used bit in bitmap */
67 __clear_bit(slotid, tbl->used_slots);
68
69 /* update highest_used_slotid when it is freed */
70 if (slotid == tbl->highest_used_slotid) {
71 u32 new_max = find_last_bit(tbl->used_slots, slotid);
72 if (new_max < slotid)
73 tbl->highest_used_slotid = new_max;
74 else {
75 tbl->highest_used_slotid = NFS4_NO_SLOT;
76 nfs4_session_drain_complete(tbl->session, tbl);
77 }
78 }
79 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
80 slotid, tbl->highest_used_slotid);
81}
82
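When the freed slot was the highest in use, find_last_bit() locates the next-highest set bit below it; only if nothing is left does highest_used_slotid fall back to NFS4_NO_SLOT. A small standalone sketch of that downward scan, using a plain 64-bit word instead of the kernel bitmap (names and sizes are illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	#define NO_SLOT ((uint32_t)-1)

	/* Highest set bit strictly below 'limit', or NO_SLOT if none --
	 * the role find_last_bit() plays in nfs4_free_slot(). */
	static uint32_t highest_used_below(uint64_t used, uint32_t limit)
	{
		for (uint32_t i = limit; i-- > 0; )
			if (used & (1ULL << i))
				return i;
		return NO_SLOT;
	}

	int main(void)
	{
		uint64_t used = 0x15;	/* slots 0, 2 and 4 in use */

		used &= ~(1ULL << 4);	/* free slot 4, the current highest */
		printf("new highest: %d\n", (int)highest_used_below(used, 4));
		return 0;	/* prints "new highest: 2" */
	}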
83static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
84 u32 slotid, u32 seq_init, gfp_t gfp_mask)
85{
86 struct nfs4_slot *slot;
87
88 slot = kzalloc(sizeof(*slot), gfp_mask);
89 if (slot) {
90 slot->table = tbl;
91 slot->slot_nr = slotid;
92 slot->seq_nr = seq_init;
93 }
94 return slot;
95}
96
97static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
98 u32 slotid, u32 seq_init, gfp_t gfp_mask)
99{
100 struct nfs4_slot **p, *slot;
101
102 p = &tbl->slots;
103 for (;;) {
104 if (*p == NULL) {
105 *p = nfs4_new_slot(tbl, tbl->max_slots,
106 seq_init, gfp_mask);
107 if (*p == NULL)
108 break;
109 tbl->max_slots++;
110 }
111 slot = *p;
112 if (slot->slot_nr == slotid)
113 return slot;
114 p = &slot->next;
115 }
116 return ERR_PTR(-ENOMEM);
117}
118
119/*
120 * nfs4_alloc_slot - efficiently look for a free slot
121 *
122 * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap.
123 * If found, we mark the slot as used, update the highest_used_slotid,
124 * and set up the sequence operation args accordingly.
125 *
126 * Note: must be called while holding the slot_tbl_lock.
127 */
128struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
129{
130 struct nfs4_slot *ret = ERR_PTR(-EBUSY);
131 u32 slotid;
132
133 dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
134 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
135 tbl->max_slotid + 1);
136 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
137 if (slotid > tbl->max_slotid)
138 goto out;
139 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
140 if (IS_ERR(ret))
141 goto out;
142 __set_bit(slotid, tbl->used_slots);
143 if (slotid > tbl->highest_used_slotid ||
144 tbl->highest_used_slotid == NFS4_NO_SLOT)
145 tbl->highest_used_slotid = slotid;
146 ret->generation = tbl->generation;
147
148out:
149 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d\n",
150 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
151 !IS_ERR(ret) ? ret->slot_nr : -1);
152 return ret;
153}
154
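Allocation is the mirror image: find the first zero bit at or below max_slotid, create the slot on demand if the list is still short, and raise highest_used_slotid when the new slot passes it. A compact sketch of the bitmap step only (illustrative, not the kernel API):

	#include <stdint.h>

	#define NO_SLOT ((uint32_t)-1)

	/* First clear bit in [0, max_slotid], or NO_SLOT when every slot in
	 * range is busy -- the find_first_zero_bit() step of nfs4_alloc_slot(). */
	static uint32_t first_free_slot(uint64_t used, uint32_t max_slotid)
	{
		for (uint32_t i = 0; i <= max_slotid; i++)
			if (!(used & (1ULL << i)))
				return i;
		return NO_SLOT;	/* caller reports -EBUSY */
	}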
155static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl,
156 u32 max_reqs, u32 ivalue)
157{
158 if (max_reqs <= tbl->max_slots)
159 return 0;
160 if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)))
161 return 0;
162 return -ENOMEM;
163}
164
165static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
166 u32 server_highest_slotid,
167 u32 ivalue)
168{
169 struct nfs4_slot **p;
170
171 nfs4_shrink_slot_table(tbl, server_highest_slotid + 1);
172 p = &tbl->slots;
173 while (*p) {
174 (*p)->seq_nr = ivalue;
175 (*p)->interrupted = 0;
176 p = &(*p)->next;
177 }
178 tbl->highest_used_slotid = NFS4_NO_SLOT;
179 tbl->target_highest_slotid = server_highest_slotid;
180 tbl->server_highest_slotid = server_highest_slotid;
181 tbl->d_target_highest_slotid = 0;
182 tbl->d2_target_highest_slotid = 0;
183 tbl->max_slotid = server_highest_slotid;
184}
185
186/*
187 * (re)Initialise a slot table
188 */
189static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
190 u32 max_reqs, u32 ivalue)
191{
192 int ret;
193
194 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
195 max_reqs, tbl->max_slots);
196
197 if (max_reqs > NFS4_MAX_SLOT_TABLE)
198 max_reqs = NFS4_MAX_SLOT_TABLE;
199
200 ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue);
201 if (ret)
202 goto out;
203
204 spin_lock(&tbl->slot_tbl_lock);
205 nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
206 spin_unlock(&tbl->slot_tbl_lock);
207
208 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
209 tbl, tbl->slots, tbl->max_slots);
210out:
211 dprintk("<-- %s: return %d\n", __func__, ret);
212 return ret;
213}
214
215/* Destroy the slot table */
216static void nfs4_destroy_slot_tables(struct nfs4_session *session)
217{
218 nfs4_shrink_slot_table(&session->fc_slot_table, 0);
219 nfs4_shrink_slot_table(&session->bc_slot_table, 0);
220}
221
222static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
223{
224 struct nfs4_sequence_args *args = task->tk_msg.rpc_argp;
225 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
226 struct nfs4_slot *slot = pslot;
227 struct nfs4_slot_table *tbl = slot->table;
228
229 if (nfs4_session_draining(tbl->session) && !args->sa_privileged)
230 return false;
231 slot->generation = tbl->generation;
232 args->sa_slot = slot;
233 res->sr_timestamp = jiffies;
234 res->sr_slot = slot;
235 res->sr_status_flags = 0;
236 res->sr_status = 1;
237 return true;
238}
239
240static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
241 struct nfs4_slot *slot)
242{
243 if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot))
244 return true;
245 return false;
246}
247
248bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
249 struct nfs4_slot *slot)
250{
251 if (slot->slot_nr > tbl->max_slotid)
252 return false;
253 return __nfs41_wake_and_assign_slot(tbl, slot);
254}
255
256static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl)
257{
258 struct nfs4_slot *slot = nfs4_alloc_slot(tbl);
259 if (!IS_ERR(slot)) {
260 bool ret = __nfs41_wake_and_assign_slot(tbl, slot);
261 if (ret)
262 return ret;
263 nfs4_free_slot(tbl, slot);
264 }
265 return false;
266}
267
268void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
269{
270 for (;;) {
271 if (!nfs41_try_wake_next_slot_table_entry(tbl))
272 break;
273 }
274}
275
276static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
277 u32 target_highest_slotid)
278{
279 u32 max_slotid;
280
281 max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid);
282 if (max_slotid > tbl->server_highest_slotid)
283 max_slotid = tbl->server_highest_slotid;
284 if (max_slotid > tbl->target_highest_slotid)
285 max_slotid = tbl->target_highest_slotid;
286 tbl->max_slotid = max_slotid;
287 nfs41_wake_slot_table(tbl);
288}
289
290/* Update the client's idea of target_highest_slotid */
291static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl,
292 u32 target_highest_slotid)
293{
294 if (tbl->target_highest_slotid == target_highest_slotid)
295 return;
296 tbl->target_highest_slotid = target_highest_slotid;
297 tbl->generation++;
298}
299
300void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
301 u32 target_highest_slotid)
302{
303 spin_lock(&tbl->slot_tbl_lock);
304 nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
305 tbl->d_target_highest_slotid = 0;
306 tbl->d2_target_highest_slotid = 0;
307 nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
308 spin_unlock(&tbl->slot_tbl_lock);
309}
310
311static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl,
312 u32 highest_slotid)
313{
314 if (tbl->server_highest_slotid == highest_slotid)
315 return;
316 if (tbl->highest_used_slotid > highest_slotid)
317 return;
318 /* Deallocate slots */
319 nfs4_shrink_slot_table(tbl, highest_slotid + 1);
320 tbl->server_highest_slotid = highest_slotid;
321}
322
323static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2)
324{
325 s1 -= s2;
326 if (s1 == 0)
327 return 0;
328 if (s1 < 0)
329 return (s1 - 1) >> 1;
330 return (s1 + 1) >> 1;
331}
332
333static int nfs41_sign_s32(s32 s1)
334{
335 if (s1 > 0)
336 return 1;
337 if (s1 < 0)
338 return -1;
339 return 0;
340}
341
342static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2)
343{
344 if (!s1 || !s2)
345 return true;
346 return nfs41_sign_s32(s1) == nfs41_sign_s32(s2);
347}
348
349/* Try to eliminate outliers by checking for sharp changes in the
350 * derivatives and second derivatives
351 */
352static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl,
353 u32 new_target)
354{
355 s32 d_target, d2_target;
356 bool ret = true;
357
358 d_target = nfs41_derivative_target_slotid(new_target,
359 tbl->target_highest_slotid);
360 d2_target = nfs41_derivative_target_slotid(d_target,
361 tbl->d_target_highest_slotid);
362 /* Is first derivative same sign? */
363 if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid))
364 ret = false;
365 /* Is second derivative same sign? */
366 if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid))
367 ret = false;
368 tbl->d_target_highest_slotid = d_target;
369 tbl->d2_target_highest_slotid = d2_target;
370 return ret;
371}
372
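The filter above computes half-differences rounded away from zero, and a new target is treated as an outlier only when both the first and the second derivative flip sign against the stored values (a zero on either side counts as "same sign"). A short sketch of the arithmetic with made-up targets:

	#include <stdio.h>
	#include <stdint.h>

	/* Half of (s1 - s2), rounded away from zero, as in
	 * nfs41_derivative_target_slotid(); like the kernel version, this
	 * relies on arithmetic right shift of negative values. */
	static int32_t deriv(int32_t s1, int32_t s2)
	{
		s1 -= s2;
		if (s1 == 0)
			return 0;
		return s1 < 0 ? (s1 - 1) >> 1 : (s1 + 1) >> 1;
	}

	int main(void)
	{
		/* Targets 10 -> 11 -> 40: first derivatives +1 then +15. */
		int32_t d_old = deriv(11, 10);		/* +1 */
		int32_t d_new = deriv(40, 11);		/* +15 */
		int32_t d2 = deriv(d_new, d_old);	/* +7 */

		/* Both derivatives stay positive, so 40 is accepted here. */
		printf("d=%d d2=%d\n", d_new, d2);
		return 0;
	}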
373void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
374 struct nfs4_slot *slot,
375 struct nfs4_sequence_res *res)
376{
377 spin_lock(&tbl->slot_tbl_lock);
378 if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
379 nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
380 if (tbl->generation == slot->generation)
381 nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
382 nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
383 spin_unlock(&tbl->slot_tbl_lock);
384}
385
386/*
387 * Initialize or reset the forechannel and backchannel tables
388 */
389int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
390{
391 struct nfs4_slot_table *tbl;
392 int status;
393
394 dprintk("--> %s\n", __func__);
395 /* Fore channel */
396 tbl = &ses->fc_slot_table;
397 tbl->session = ses;
398 status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
399 if (status) /* -ENOMEM */
400 return status;
401 /* Back channel */
402 tbl = &ses->bc_slot_table;
403 tbl->session = ses;
404 status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
405 if (status && tbl->slots == NULL)
406 /* Fore and back channel share a connection so get
407 * both slot tables or neither */
408 nfs4_destroy_slot_tables(ses);
409 return status;
410}
411
412struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
413{
414 struct nfs4_session *session;
415 struct nfs4_slot_table *tbl;
416
417 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
418 if (!session)
419 return NULL;
420
421 tbl = &session->fc_slot_table;
422 tbl->highest_used_slotid = NFS4_NO_SLOT;
423 spin_lock_init(&tbl->slot_tbl_lock);
424 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
425 init_completion(&tbl->complete);
426
427 tbl = &session->bc_slot_table;
428 tbl->highest_used_slotid = NFS4_NO_SLOT;
429 spin_lock_init(&tbl->slot_tbl_lock);
430 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
431 init_completion(&tbl->complete);
432
433 session->session_state = 1<<NFS4_SESSION_INITING;
434
435 session->clp = clp;
436 return session;
437}
438
439void nfs4_destroy_session(struct nfs4_session *session)
440{
441 struct rpc_xprt *xprt;
442 struct rpc_cred *cred;
443
444 cred = nfs4_get_exchange_id_cred(session->clp);
445 nfs4_proc_destroy_session(session, cred);
446 if (cred)
447 put_rpccred(cred);
448
449 rcu_read_lock();
450 xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
451 rcu_read_unlock();
452 dprintk("%s Destroy backchannel for xprt %p\n",
453 __func__, xprt);
454 xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
455 nfs4_destroy_slot_tables(session);
456 kfree(session);
457}
458
459/*
460 * With sessions, the client is not marked ready until after a
461 * successful EXCHANGE_ID and CREATE_SESSION.
462 *
463 * Map cl_cons_state errors to EPROTONOSUPPORT to indicate that
464 * other versions of NFS can be tried.
465 */
466static int nfs41_check_session_ready(struct nfs_client *clp)
467{
468 int ret;
469
470 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
471 ret = nfs4_client_recover_expired_lease(clp);
472 if (ret)
473 return ret;
474 }
475 if (clp->cl_cons_state < NFS_CS_READY)
476 return -EPROTONOSUPPORT;
477 smp_rmb();
478 return 0;
479}
480
481int nfs4_init_session(struct nfs_server *server)
482{
483 struct nfs_client *clp = server->nfs_client;
484 struct nfs4_session *session;
485 unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
486 unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
487
488 if (!nfs4_has_session(clp))
489 return 0;
490
491 if (server->rsize != 0)
492 target_max_resp_sz = server->rsize;
493 target_max_resp_sz += nfs41_maxread_overhead;
494
495 if (server->wsize != 0)
496 target_max_rqst_sz = server->wsize;
497 target_max_rqst_sz += nfs41_maxwrite_overhead;
498
499 session = clp->cl_session;
500 spin_lock(&clp->cl_lock);
501 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
502 /* Initialise targets and channel attributes */
503 session->fc_target_max_rqst_sz = target_max_rqst_sz;
504 session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
505 session->fc_target_max_resp_sz = target_max_resp_sz;
506 session->fc_attrs.max_resp_sz = target_max_resp_sz;
507 } else {
508 /* Just adjust the targets */
509 if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
510 session->fc_target_max_rqst_sz = target_max_rqst_sz;
511 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
512 }
513 if (target_max_resp_sz > session->fc_target_max_resp_sz) {
514 session->fc_target_max_resp_sz = target_max_resp_sz;
515 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
516 }
517 }
518 spin_unlock(&clp->cl_lock);
519
520 if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
521 nfs4_schedule_lease_recovery(clp);
522
523 return nfs41_check_session_ready(clp);
524}
525
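Outside the initial setup path, nfs4_init_session only raises the recorded targets and, when a target grows, flags NFS4CLNT_SESSION_RESET so the channel attributes get renegotiated. A tiny sketch of that "raise target, request reset" step (hypothetical helper, for illustration):

	/* Hypothetical helper mirroring the fc_target_max_* updates above:
	 * raise the stored target if needed and tell the caller whether a
	 * session reset should be scheduled. */
	static int raise_target(unsigned int *target, unsigned int wanted)
	{
		if (wanted > *target) {
			*target = wanted;
			return 1;	/* caller sets NFS4CLNT_SESSION_RESET */
		}
		return 0;
	}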
526int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
527{
528 struct nfs4_session *session = clp->cl_session;
529 int ret;
530
531 spin_lock(&clp->cl_lock);
532 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
533 /*
534 * Do not set NFS_CS_CHECK_LEASE_TIME; instead set the
535 * DS lease to be equal to the MDS lease.
536 */
537 clp->cl_lease_time = lease_time;
538 clp->cl_last_renewal = jiffies;
539 }
540 spin_unlock(&clp->cl_lock);
541
542 ret = nfs41_check_session_ready(clp);
543 if (ret)
544 return ret;
545 /* Test for the DS role */
546 if (!is_ds_client(clp))
547 return -ENODEV;
548 return 0;
549}
550EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
551
552
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
new file mode 100644
index 000000000000..6f3cb39386d4
--- /dev/null
+++ b/fs/nfs/nfs4session.h
@@ -0,0 +1,142 @@
1/*
2 * fs/nfs/nfs4session.h
3 *
4 * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 */
7#ifndef __LINUX_FS_NFS_NFS4SESSION_H
8#define __LINUX_FS_NFS_NFS4SESSION_H
9
10/* maximum number of slots to use */
11#define NFS4_DEF_SLOT_TABLE_SIZE (16U)
12#define NFS4_MAX_SLOT_TABLE (1024U)
13#define NFS4_NO_SLOT ((u32)-1)
14
15#if IS_ENABLED(CONFIG_NFS_V4)
16
17/* Sessions slot seqid */
18struct nfs4_slot {
19 struct nfs4_slot_table *table;
20 struct nfs4_slot *next;
21 unsigned long generation;
22 u32 slot_nr;
23 u32 seq_nr;
24 unsigned int interrupted : 1;
25};
26
27/* Sessions */
28#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
29struct nfs4_slot_table {
30 struct nfs4_session *session; /* Parent session */
31 struct nfs4_slot *slots; /* seqid per slot */
32 unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
33 spinlock_t slot_tbl_lock;
34 struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
35 u32 max_slots; /* # slots in table */
36 u32 max_slotid; /* Max allowed slotid value */
37 u32 highest_used_slotid; /* sent to server on each SEQ.
38 * op for dynamic resizing */
39 u32 target_highest_slotid; /* Server max_slot target */
40 u32 server_highest_slotid; /* Server highest slotid */
41 s32 d_target_highest_slotid; /* Derivative */
42 s32 d2_target_highest_slotid; /* 2nd derivative */
43 unsigned long generation; /* Generation counter for
44 target_highest_slotid */
45 struct completion complete;
46};
47
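SLOT_TABLE_SZ above rounds the 1024-slot maximum up to whole longs for the used_slots bitmap; on a 64-bit build that is DIV_ROUND_UP(1024, 64) = 16 words. A one-line check of the arithmetic:

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	int main(void)
	{
		/* 1024 slots, 8 * sizeof(long) bits per word -> 16 on LP64. */
		printf("%lu\n", DIV_ROUND_UP(1024UL, 8 * sizeof(long)));
		return 0;
	}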
48/*
49 * Session related parameters
50 */
51struct nfs4_session {
52 struct nfs4_sessionid sess_id;
53 u32 flags;
54 unsigned long session_state;
55 u32 hash_alg;
56 u32 ssv_len;
57
58 /* The fore and back channel */
59 struct nfs4_channel_attrs fc_attrs;
60 struct nfs4_slot_table fc_slot_table;
61 struct nfs4_channel_attrs bc_attrs;
62 struct nfs4_slot_table bc_slot_table;
63 struct nfs_client *clp;
64 /* Create session arguments */
65 unsigned int fc_target_max_rqst_sz;
66 unsigned int fc_target_max_resp_sz;
67};
68
69enum nfs4_session_state {
70 NFS4_SESSION_INITING,
71 NFS4_SESSION_DRAINING,
72};
73
74#if defined(CONFIG_NFS_V4_1)
75extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
76extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
77
78extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
79 u32 target_highest_slotid);
80extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
81 struct nfs4_slot *slot,
82 struct nfs4_sequence_res *res);
83
84extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
85
86extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
87extern void nfs4_destroy_session(struct nfs4_session *session);
88extern int nfs4_init_session(struct nfs_server *server);
89extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
90
91extern void nfs4_session_drain_complete(struct nfs4_session *session,
92 struct nfs4_slot_table *tbl);
93
94static inline bool nfs4_session_draining(struct nfs4_session *session)
95{
96 return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state);
97}
98
99bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
100 struct nfs4_slot *slot);
101void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
102
103/*
104 * Determine if sessions are in use.
105 */
106static inline int nfs4_has_session(const struct nfs_client *clp)
107{
108 if (clp->cl_session)
109 return 1;
110 return 0;
111}
112
113static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
114{
115 if (nfs4_has_session(clp))
116 return (clp->cl_session->flags & SESSION4_PERSIST);
117 return 0;
118}
119
120#else /* defined(CONFIG_NFS_V4_1) */
121
122static inline int nfs4_init_session(struct nfs_server *server)
123{
124 return 0;
125}
126
127/*
128 * Determine if sessions are in use.
129 */
130static inline int nfs4_has_session(const struct nfs_client *clp)
131{
132 return 0;
133}
134
135static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
136{
137 return 0;
138}
139
140#endif /* defined(CONFIG_NFS_V4_1) */
141#endif /* IS_ENABLED(CONFIG_NFS_V4) */
142#endif /* __LINUX_FS_NFS_NFS4SESSION_H */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c351e6b39838..9448c579d41a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -57,6 +57,7 @@
57#include "callback.h" 57#include "callback.h"
58#include "delegation.h" 58#include "delegation.h"
59#include "internal.h" 59#include "internal.h"
60#include "nfs4session.h"
60#include "pnfs.h" 61#include "pnfs.h"
61#include "netns.h" 62#include "netns.h"
62 63
@@ -66,7 +67,6 @@
66 67
67const nfs4_stateid zero_stateid; 68const nfs4_stateid zero_stateid;
68static DEFINE_MUTEX(nfs_clid_init_mutex); 69static DEFINE_MUTEX(nfs_clid_init_mutex);
69static LIST_HEAD(nfs4_clientid_list);
70 70
71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
72{ 72{
@@ -254,24 +254,27 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
254{ 254{
255 struct nfs4_session *ses = clp->cl_session; 255 struct nfs4_session *ses = clp->cl_session;
256 struct nfs4_slot_table *tbl; 256 struct nfs4_slot_table *tbl;
257 int max_slots;
258 257
259 if (ses == NULL) 258 if (ses == NULL)
260 return; 259 return;
261 tbl = &ses->fc_slot_table; 260 tbl = &ses->fc_slot_table;
262 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { 261 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
263 spin_lock(&tbl->slot_tbl_lock); 262 spin_lock(&tbl->slot_tbl_lock);
264 max_slots = tbl->max_slots; 263 nfs41_wake_slot_table(tbl);
265 while (max_slots--) {
266 if (rpc_wake_up_first(&tbl->slot_tbl_waitq,
267 nfs4_set_task_privileged,
268 NULL) == NULL)
269 break;
270 }
271 spin_unlock(&tbl->slot_tbl_lock); 264 spin_unlock(&tbl->slot_tbl_lock);
272 } 265 }
273} 266}
274 267
268/*
269 * Signal state manager thread if session fore channel is drained
270 */
271void nfs4_session_drain_complete(struct nfs4_session *session,
272 struct nfs4_slot_table *tbl)
273{
274 if (nfs4_session_draining(session))
275 complete(&tbl->complete);
276}
277
275static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) 278static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
276{ 279{
277 spin_lock(&tbl->slot_tbl_lock); 280 spin_lock(&tbl->slot_tbl_lock);
@@ -303,7 +306,6 @@ static void nfs41_finish_session_reset(struct nfs_client *clp)
303 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 306 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
304 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); 307 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
305 /* create_session negotiated new slot table */ 308 /* create_session negotiated new slot table */
306 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
307 clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); 309 clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
308 nfs41_setup_state_renewal(clp); 310 nfs41_setup_state_renewal(clp);
309} 311}
@@ -1086,7 +1088,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
1086 */ 1088 */
1087static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 1089static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
1088{ 1090{
1089 BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);
1090 switch (status) { 1091 switch (status) {
1091 case 0: 1092 case 0:
1092 break; 1093 break;
@@ -1209,6 +1210,40 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1209} 1210}
1210EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); 1211EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
1211 1212
1213int nfs4_wait_clnt_recover(struct nfs_client *clp)
1214{
1215 int res;
1216
1217 might_sleep();
1218
1219 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
1220 nfs_wait_bit_killable, TASK_KILLABLE);
1221 if (res)
1222 return res;
1223
1224 if (clp->cl_cons_state < 0)
1225 return clp->cl_cons_state;
1226 return 0;
1227}
1228
1229int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1230{
1231 unsigned int loop;
1232 int ret;
1233
1234 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1235 ret = nfs4_wait_clnt_recover(clp);
1236 if (ret != 0)
1237 break;
1238 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1239 !test_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state))
1240 break;
1241 nfs4_schedule_state_manager(clp);
1242 ret = -EIO;
1243 }
1244 return ret;
1245}
1246
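The recovery loop above is deliberately bounded: each pass waits for the state manager, re-checks the lease bits, kicks the manager again, and gives up with -EIO once NFS4_MAX_LOOP_ON_RECOVER passes have produced no progress. The same shape as a standalone sketch (stubbed condition and an assumed loop count, not the kernel values):

	#include <stdio.h>

	#define MAX_LOOP_ON_RECOVER 10	/* assumed value, for illustration */
	#define EIO 5

	static int lease_still_expired(unsigned int pass)
	{
		return pass < 3;	/* stub: recovers on the third pass */
	}

	static int recover_expired_lease(void)
	{
		int ret = 0;

		for (unsigned int loop = MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
			ret = 0;	/* stands in for nfs4_wait_clnt_recover() */
			if (!lease_still_expired(MAX_LOOP_ON_RECOVER - loop))
				break;	/* recovered: ret stays 0 */
			/* ...kick the state manager here... */
			ret = -EIO;	/* provisional, cleared on success */
		}
		return ret;
	}

	int main(void)
	{
		printf("%d\n", recover_expired_lease());	/* prints 0 */
		return 0;
	}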
1212/* 1247/*
1213 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN 1248 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
1214 * @clp: client to process 1249 * @clp: client to process
@@ -1401,14 +1436,6 @@ restart:
1401 /* Mark the file as being 'closed' */ 1436 /* Mark the file as being 'closed' */
1402 state->state = 0; 1437 state->state = 0;
1403 break; 1438 break;
1404 case -EKEYEXPIRED:
1405 /*
1406 * User RPCSEC_GSS context has expired.
1407 * We cannot recover this stateid now, so
1408 * skip it and allow recovery thread to
1409 * proceed.
1410 */
1411 break;
1412 case -NFS4ERR_ADMIN_REVOKED: 1439 case -NFS4ERR_ADMIN_REVOKED:
1413 case -NFS4ERR_STALE_STATEID: 1440 case -NFS4ERR_STALE_STATEID:
1414 case -NFS4ERR_BAD_STATEID: 1441 case -NFS4ERR_BAD_STATEID:
@@ -1561,14 +1588,6 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1561 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1588 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1562} 1589}
1563 1590
1564static void nfs4_warn_keyexpired(const char *s)
1565{
1566 printk_ratelimited(KERN_WARNING "Error: state manager"
1567 " encountered RPCSEC_GSS session"
1568 " expired against NFSv4 server %s.\n",
1569 s);
1570}
1571
1572static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) 1591static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1573{ 1592{
1574 switch (error) { 1593 switch (error) {
@@ -1602,10 +1621,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1602 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1621 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1603 set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); 1622 set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
1604 break; 1623 break;
1605 case -EKEYEXPIRED:
1606 /* Nothing we can do */
1607 nfs4_warn_keyexpired(clp->cl_hostname);
1608 break;
1609 default: 1624 default:
1610 dprintk("%s: failed to handle error %d for server %s\n", 1625 dprintk("%s: failed to handle error %d for server %s\n",
1611 __func__, error, clp->cl_hostname); 1626 __func__, error, clp->cl_hostname);
@@ -1722,8 +1737,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1722 dprintk("%s: exit with error %d for server %s\n", 1737 dprintk("%s: exit with error %d for server %s\n",
1723 __func__, -EPROTONOSUPPORT, clp->cl_hostname); 1738 __func__, -EPROTONOSUPPORT, clp->cl_hostname);
1724 return -EPROTONOSUPPORT; 1739 return -EPROTONOSUPPORT;
1725 case -EKEYEXPIRED:
1726 nfs4_warn_keyexpired(clp->cl_hostname);
1727 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1740 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1728 * in nfs4_exchange_id */ 1741 * in nfs4_exchange_id */
1729 default: 1742 default:
@@ -1876,7 +1889,6 @@ again:
1876 break; 1889 break;
1877 1890
1878 case -EKEYEXPIRED: 1891 case -EKEYEXPIRED:
1879 nfs4_warn_keyexpired(clp->cl_hostname);
1880 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1892 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1881 * in nfs4_exchange_id */ 1893 * in nfs4_exchange_id */
1882 status = -EKEYEXPIRED; 1894 status = -EKEYEXPIRED;
@@ -1907,14 +1919,23 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
1907} 1919}
1908EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); 1920EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1909 1921
1910void nfs41_handle_recall_slot(struct nfs_client *clp) 1922static void nfs41_ping_server(struct nfs_client *clp)
1911{ 1923{
1912 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1924 /* Use CHECK_LEASE to ping the server with a SEQUENCE */
1913 dprintk("%s: scheduling slot recall for server %s\n", __func__, 1925 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1914 clp->cl_hostname);
1915 nfs4_schedule_state_manager(clp); 1926 nfs4_schedule_state_manager(clp);
1916} 1927}
1917 1928
1929void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
1930{
1931 nfs41_ping_server(clp);
1932}
1933
1934void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
1935{
1936 nfs41_ping_server(clp);
1937}
1938
1918static void nfs4_reset_all_state(struct nfs_client *clp) 1939static void nfs4_reset_all_state(struct nfs_client *clp)
1919{ 1940{
1920 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1941 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
@@ -2024,35 +2045,6 @@ out:
2024 return status; 2045 return status;
2025} 2046}
2026 2047
2027static int nfs4_recall_slot(struct nfs_client *clp)
2028{
2029 struct nfs4_slot_table *fc_tbl;
2030 struct nfs4_slot *new, *old;
2031 int i;
2032
2033 if (!nfs4_has_session(clp))
2034 return 0;
2035 nfs4_begin_drain_session(clp);
2036 fc_tbl = &clp->cl_session->fc_slot_table;
2037 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
2038 GFP_NOFS);
2039 if (!new)
2040 return -ENOMEM;
2041
2042 spin_lock(&fc_tbl->slot_tbl_lock);
2043 for (i = 0; i < fc_tbl->target_max_slots; i++)
2044 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
2045 old = fc_tbl->slots;
2046 fc_tbl->slots = new;
2047 fc_tbl->max_slots = fc_tbl->target_max_slots;
2048 fc_tbl->target_max_slots = 0;
2049 clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots;
2050 spin_unlock(&fc_tbl->slot_tbl_lock);
2051
2052 kfree(old);
2053 return 0;
2054}
2055
2056static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2048static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2057{ 2049{
2058 struct rpc_cred *cred; 2050 struct rpc_cred *cred;
@@ -2083,7 +2075,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2083#else /* CONFIG_NFS_V4_1 */ 2075#else /* CONFIG_NFS_V4_1 */
2084static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 2076static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
2085static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } 2077static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
2086static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
2087 2078
2088static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2079static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2089{ 2080{
@@ -2115,15 +2106,6 @@ static void nfs4_state_manager(struct nfs_client *clp)
2115 continue; 2106 continue;
2116 } 2107 }
2117 2108
2118 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
2119 section = "check lease";
2120 status = nfs4_check_lease(clp);
2121 if (status < 0)
2122 goto out_error;
2123 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
2124 continue;
2125 }
2126
2127 /* Initialize or reset the session */ 2109 /* Initialize or reset the session */
2128 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { 2110 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) {
2129 section = "reset session"; 2111 section = "reset session";
@@ -2144,10 +2126,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
2144 continue; 2126 continue;
2145 } 2127 }
2146 2128
2147 /* Recall session slots */ 2129 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
2148 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { 2130 section = "check lease";
2149 section = "recall slot"; 2131 status = nfs4_check_lease(clp);
2150 status = nfs4_recall_slot(clp);
2151 if (status < 0) 2132 if (status < 0)
2152 goto out_error; 2133 goto out_error;
2153 continue; 2134 continue;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index bd61221ad2c5..84d2e9e2f313 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -51,6 +51,7 @@ static const struct super_operations nfs4_sops = {
51 .alloc_inode = nfs_alloc_inode, 51 .alloc_inode = nfs_alloc_inode,
52 .destroy_inode = nfs_destroy_inode, 52 .destroy_inode = nfs_destroy_inode,
53 .write_inode = nfs4_write_inode, 53 .write_inode = nfs4_write_inode,
54 .drop_inode = nfs_drop_inode,
54 .put_super = nfs_put_super, 55 .put_super = nfs_put_super,
55 .statfs = nfs_statfs, 56 .statfs = nfs_statfs,
56 .evict_inode = nfs4_evict_inode, 57 .evict_inode = nfs4_evict_inode,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 40836ee5dc3a..26b143920433 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -56,6 +56,7 @@
56 56
57#include "nfs4_fs.h" 57#include "nfs4_fs.h"
58#include "internal.h" 58#include "internal.h"
59#include "nfs4session.h"
59#include "pnfs.h" 60#include "pnfs.h"
60#include "netns.h" 61#include "netns.h"
61 62
@@ -270,6 +271,8 @@ static int nfs4_stat_to_errno(int);
270 271
271#if defined(CONFIG_NFS_V4_1) 272#if defined(CONFIG_NFS_V4_1)
272#define NFS4_MAX_MACHINE_NAME_LEN (64) 273#define NFS4_MAX_MACHINE_NAME_LEN (64)
274#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \
275 sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)
273 276
274#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ 277#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
275 encode_verifier_maxsz + \ 278 encode_verifier_maxsz + \
@@ -282,7 +285,7 @@ static int nfs4_stat_to_errno(int);
282 1 /* nii_domain */ + \ 285 1 /* nii_domain */ + \
283 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 286 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
284 1 /* nii_name */ + \ 287 1 /* nii_name */ + \
285 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 288 XDR_QUADLEN(IMPL_NAME_LIMIT) + \
286 3 /* nii_date */) 289 3 /* nii_date */)
287#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 290#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
288 2 /* eir_clientid */ + \ 291 2 /* eir_clientid */ + \
@@ -936,7 +939,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
936 * but this is not required as a MUST for the server to do so. */ 939 * but this is not required as a MUST for the server to do so. */
937 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; 940 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
938 941
939 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 942 WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
940 encode_string(xdr, hdr->taglen, hdr->tag); 943 encode_string(xdr, hdr->taglen, hdr->tag);
941 p = reserve_space(xdr, 8); 944 p = reserve_space(xdr, 8);
942 *p++ = cpu_to_be32(hdr->minorversion); 945 *p++ = cpu_to_be32(hdr->minorversion);
@@ -955,7 +958,7 @@ static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
955 958
956static void encode_nops(struct compound_hdr *hdr) 959static void encode_nops(struct compound_hdr *hdr)
957{ 960{
958 BUG_ON(hdr->nops > NFS4_MAX_OPS); 961 WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);
959 *hdr->nops_p = htonl(hdr->nops); 962 *hdr->nops_p = htonl(hdr->nops);
960} 963}
961 964
@@ -1403,7 +1406,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1403 *p = cpu_to_be32(NFS4_OPEN_NOCREATE); 1406 *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
1404 break; 1407 break;
1405 default: 1408 default:
1406 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1407 *p = cpu_to_be32(NFS4_OPEN_CREATE); 1409 *p = cpu_to_be32(NFS4_OPEN_CREATE);
1408 encode_createmode(xdr, arg); 1410 encode_createmode(xdr, arg);
1409 } 1411 }
@@ -1621,7 +1623,6 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1621 p = reserve_space(xdr, 2*4); 1623 p = reserve_space(xdr, 2*4);
1622 *p++ = cpu_to_be32(1); 1624 *p++ = cpu_to_be32(1);
1623 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1625 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1624 BUG_ON(arg->acl_len % 4);
1625 p = reserve_space(xdr, 4); 1626 p = reserve_space(xdr, 4);
1626 *p = cpu_to_be32(arg->acl_len); 1627 *p = cpu_to_be32(arg->acl_len);
1627 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1628 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
@@ -1713,7 +1714,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1713 struct compound_hdr *hdr) 1714 struct compound_hdr *hdr)
1714{ 1715{
1715 __be32 *p; 1716 __be32 *p;
1716 char impl_name[NFS4_OPAQUE_LIMIT]; 1717 char impl_name[IMPL_NAME_LIMIT];
1717 int len = 0; 1718 int len = 0;
1718 1719
1719 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); 1720 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
@@ -1728,7 +1729,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1728 if (send_implementation_id && 1729 if (send_implementation_id &&
1729 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && 1730 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
1730 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) 1731 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
1731 <= NFS4_OPAQUE_LIMIT + 1) 1732 <= sizeof(impl_name) + 1)
1732 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", 1733 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
1733 utsname()->sysname, utsname()->release, 1734 utsname()->sysname, utsname()->release,
1734 utsname()->version, utsname()->machine); 1735 utsname()->version, utsname()->machine);
@@ -1835,18 +1836,16 @@ static void encode_sequence(struct xdr_stream *xdr,
1835 struct compound_hdr *hdr) 1836 struct compound_hdr *hdr)
1836{ 1837{
1837#if defined(CONFIG_NFS_V4_1) 1838#if defined(CONFIG_NFS_V4_1)
1838 struct nfs4_session *session = args->sa_session; 1839 struct nfs4_session *session;
1839 struct nfs4_slot_table *tp; 1840 struct nfs4_slot_table *tp;
1840 struct nfs4_slot *slot; 1841 struct nfs4_slot *slot = args->sa_slot;
1841 __be32 *p; 1842 __be32 *p;
1842 1843
1843 if (!session) 1844 if (slot == NULL)
1844 return; 1845 return;
1845 1846
1846 tp = &session->fc_slot_table; 1847 tp = slot->table;
1847 1848 session = tp->session;
1848 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1849 slot = tp->slots + args->sa_slotid;
1850 1849
1851 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); 1850 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
1852 1851
@@ -1860,12 +1859,12 @@ static void encode_sequence(struct xdr_stream *xdr,
1860 ((u32 *)session->sess_id.data)[1], 1859 ((u32 *)session->sess_id.data)[1],
1861 ((u32 *)session->sess_id.data)[2], 1860 ((u32 *)session->sess_id.data)[2],
1862 ((u32 *)session->sess_id.data)[3], 1861 ((u32 *)session->sess_id.data)[3],
1863 slot->seq_nr, args->sa_slotid, 1862 slot->seq_nr, slot->slot_nr,
1864 tp->highest_used_slotid, args->sa_cache_this); 1863 tp->highest_used_slotid, args->sa_cache_this);
1865 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); 1864 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
1866 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1865 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1867 *p++ = cpu_to_be32(slot->seq_nr); 1866 *p++ = cpu_to_be32(slot->seq_nr);
1868 *p++ = cpu_to_be32(args->sa_slotid); 1867 *p++ = cpu_to_be32(slot->slot_nr);
1869 *p++ = cpu_to_be32(tp->highest_used_slotid); 1868 *p++ = cpu_to_be32(tp->highest_used_slotid);
1870 *p = cpu_to_be32(args->sa_cache_this); 1869 *p = cpu_to_be32(args->sa_cache_this);
1871#endif /* CONFIG_NFS_V4_1 */ 1870#endif /* CONFIG_NFS_V4_1 */
@@ -2027,8 +2026,9 @@ static void encode_free_stateid(struct xdr_stream *xdr,
2027static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) 2026static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
2028{ 2027{
2029#if defined(CONFIG_NFS_V4_1) 2028#if defined(CONFIG_NFS_V4_1)
2030 if (args->sa_session) 2029
2031 return args->sa_session->clp->cl_mvops->minor_version; 2030 if (args->sa_slot)
2031 return args->sa_slot->table->session->clp->cl_mvops->minor_version;
2032#endif /* CONFIG_NFS_V4_1 */ 2032#endif /* CONFIG_NFS_V4_1 */
2033 return 0; 2033 return 0;
2034} 2034}
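The payoff of the new slot structure shows up here: the sequence arguments carry only a slot pointer, and both the table and the session are reached through it, so a NULL slot is now the "no SEQUENCE op" signal. A sketch of the lookup chain with stand-in types (not the kernel definitions):

	#include <stdint.h>
	#include <stddef.h>

	struct sk_session { int minor_version; };
	struct sk_table   { struct sk_session *session; };
	struct sk_slot    { struct sk_table *table; uint32_t slot_nr; };

	/* Mirrors the reworked nfs4_xdr_minorversion(): everything hangs off
	 * the slot, and a NULL slot falls back to minor version 0. */
	static int minorversion(const struct sk_slot *sa_slot)
	{
		if (sa_slot)
			return sa_slot->table->session->minor_version;
		return 0;
	}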
@@ -5509,12 +5509,13 @@ static int decode_sequence(struct xdr_stream *xdr,
5509 struct rpc_rqst *rqstp) 5509 struct rpc_rqst *rqstp)
5510{ 5510{
5511#if defined(CONFIG_NFS_V4_1) 5511#if defined(CONFIG_NFS_V4_1)
5512 struct nfs4_session *session;
5512 struct nfs4_sessionid id; 5513 struct nfs4_sessionid id;
5513 u32 dummy; 5514 u32 dummy;
5514 int status; 5515 int status;
5515 __be32 *p; 5516 __be32 *p;
5516 5517
5517 if (!res->sr_session) 5518 if (res->sr_slot == NULL)
5518 return 0; 5519 return 0;
5519 5520
5520 status = decode_op_hdr(xdr, OP_SEQUENCE); 5521 status = decode_op_hdr(xdr, OP_SEQUENCE);
@@ -5528,8 +5529,9 @@ static int decode_sequence(struct xdr_stream *xdr,
5528 * sequence number, the server is looney tunes. 5529 * sequence number, the server is looney tunes.
5529 */ 5530 */
5530 status = -EREMOTEIO; 5531 status = -EREMOTEIO;
5532 session = res->sr_slot->table->session;
5531 5533
5532 if (memcmp(id.data, res->sr_session->sess_id.data, 5534 if (memcmp(id.data, session->sess_id.data,
5533 NFS4_MAX_SESSIONID_LEN)) { 5535 NFS4_MAX_SESSIONID_LEN)) {
5534 dprintk("%s Invalid session id\n", __func__); 5536 dprintk("%s Invalid session id\n", __func__);
5535 goto out_err; 5537 goto out_err;
@@ -5547,14 +5549,14 @@ static int decode_sequence(struct xdr_stream *xdr,
5547 } 5549 }
5548 /* slot id */ 5550 /* slot id */
5549 dummy = be32_to_cpup(p++); 5551 dummy = be32_to_cpup(p++);
5550 if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { 5552 if (dummy != res->sr_slot->slot_nr) {
5551 dprintk("%s Invalid slot id\n", __func__); 5553 dprintk("%s Invalid slot id\n", __func__);
5552 goto out_err; 5554 goto out_err;
5553 } 5555 }
5554 /* highest slot id - currently not processed */ 5556 /* highest slot id */
5555 dummy = be32_to_cpup(p++); 5557 res->sr_highest_slotid = be32_to_cpup(p++);
5556 /* target highest slot id - currently not processed */ 5558 /* target highest slot id */
5557 dummy = be32_to_cpup(p++); 5559 res->sr_target_highest_slotid = be32_to_cpup(p++);
5558 /* result flags */ 5560 /* result flags */
5559 res->sr_status_flags = be32_to_cpup(p); 5561 res->sr_status_flags = be32_to_cpup(p);
5560 status = 0; 5562 status = 0;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 874613545301..a9ebd817278b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -148,17 +148,6 @@ end_offset(u64 start, u64 len)
148 return end >= start ? end : NFS4_MAX_UINT64; 148 return end >= start ? end : NFS4_MAX_UINT64;
149} 149}
150 150
151/* last octet in a range */
152static inline u64
153last_byte_offset(u64 start, u64 len)
154{
155 u64 end;
156
157 BUG_ON(!len);
158 end = start + len;
159 return end > start ? end - 1 : NFS4_MAX_UINT64;
160}
161
162static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, 151static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
163 struct page ***p_pages, unsigned *p_pgbase, 152 struct page ***p_pages, unsigned *p_pgbase,
164 u64 offset, unsigned long count) 153 u64 offset, unsigned long count)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2878f97bd78d..e7165d915362 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -369,17 +369,6 @@ end_offset(u64 start, u64 len)
369 return end >= start ? end : NFS4_MAX_UINT64; 369 return end >= start ? end : NFS4_MAX_UINT64;
370} 370}
371 371
372/* last octet in a range */
373static inline u64
374last_byte_offset(u64 start, u64 len)
375{
376 u64 end;
377
378 BUG_ON(!len);
379 end = start + len;
380 return end > start ? end - 1 : NFS4_MAX_UINT64;
381}
382
383/* 372/*
384 * is l2 fully contained in l1? 373 * is l2 fully contained in l1?
385 * start1 end1 374 * start1 end1
@@ -645,7 +634,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
645 634
646 dprintk("--> %s\n", __func__); 635 dprintk("--> %s\n", __func__);
647 636
648 BUG_ON(ctx == NULL);
649 lgp = kzalloc(sizeof(*lgp), gfp_flags); 637 lgp = kzalloc(sizeof(*lgp), gfp_flags);
650 if (lgp == NULL) 638 if (lgp == NULL)
651 return NULL; 639 return NULL;
@@ -1126,7 +1114,6 @@ pnfs_update_layout(struct inode *ino,
1126 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1114 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1127 */ 1115 */
1128 spin_lock(&clp->cl_lock); 1116 spin_lock(&clp->cl_lock);
1129 BUG_ON(!list_empty(&lo->plh_layouts));
1130 list_add_tail(&lo->plh_layouts, &server->layouts); 1117 list_add_tail(&lo->plh_layouts, &server->layouts);
1131 spin_unlock(&clp->cl_lock); 1118 spin_unlock(&clp->cl_lock);
1132 } 1119 }
@@ -1222,7 +1209,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
1222{ 1209{
1223 u64 rd_size = req->wb_bytes; 1210 u64 rd_size = req->wb_bytes;
1224 1211
1225 BUG_ON(pgio->pg_lseg != NULL); 1212 WARN_ON_ONCE(pgio->pg_lseg != NULL);
1226 1213
1227 if (req->wb_offset != req->wb_pgbase) { 1214 if (req->wb_offset != req->wb_pgbase) {
1228 nfs_pageio_reset_read_mds(pgio); 1215 nfs_pageio_reset_read_mds(pgio);
@@ -1251,7 +1238,7 @@ void
1251pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1238pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1252 struct nfs_page *req, u64 wb_size) 1239 struct nfs_page *req, u64 wb_size)
1253{ 1240{
1254 BUG_ON(pgio->pg_lseg != NULL); 1241 WARN_ON_ONCE(pgio->pg_lseg != NULL);
1255 1242
1256 if (req->wb_offset != req->wb_pgbase) { 1243 if (req->wb_offset != req->wb_pgbase) {
1257 nfs_pageio_reset_write_mds(pgio); 1244 nfs_pageio_reset_write_mds(pgio);
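The objlayout.c and pnfs.c hunks above each delete a private copy of last_byte_offset() (and relax several BUG_ON()s to WARN_ON_ONCE(), so a bad state logs once instead of killing the machine). For reference, the removed helper mapped a (start, len) range to its last byte, saturating to NFS4_MAX_UINT64 when start + len wraps around. Its semantics, reproduced standalone:

#include <stdint.h>
#include <stdio.h>

#define NFS4_MAX_UINT64 UINT64_MAX

/* Same logic as the helper removed above: last octet of [start, start+len),
 * treating unsigned wraparound as "range runs to the end of the file". */
static uint64_t last_byte_offset(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;	/* len must be non-zero */

	return end > start ? end - 1 : NFS4_MAX_UINT64;
}

int main(void)
{
	printf("%llu\n",
	       (unsigned long long)last_byte_offset(4096, 512));	/* 4607 */
	printf("%llu\n",
	       (unsigned long long)last_byte_offset(UINT64_MAX - 1, 2)); /* saturates */
	return 0;
}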
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 50a88c3546ed..f084dac948e1 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -47,39 +47,6 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 47#define NFSDBG_FACILITY NFSDBG_PROC
48 48
49/* 49/*
50 * wrapper to handle the -EKEYEXPIRED error message. This should generally
51 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
52 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
53 * same way that we handle that error with NFSv3.
54 */
55static int
56nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
57{
58 int res;
59 do {
60 res = rpc_call_sync(clnt, msg, flags);
61 if (res != -EKEYEXPIRED)
62 break;
63 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
64 res = -ERESTARTSYS;
65 } while (!fatal_signal_pending(current));
66 return res;
67}
68
69#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
70
71static int
72nfs_async_handle_expired_key(struct rpc_task *task)
73{
74 if (task->tk_status != -EKEYEXPIRED)
75 return 0;
76 task->tk_status = 0;
77 rpc_restart_call(task);
78 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
79 return 1;
80}
81
82/*
83 * Bare-bones access to getattr: this is for nfs_read_super. 50 * Bare-bones access to getattr: this is for nfs_read_super.
84 */ 51 */
85static int 52static int
@@ -364,8 +331,6 @@ static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlink
364 331
365static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 332static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
366{ 333{
367 if (nfs_async_handle_expired_key(task))
368 return 0;
369 nfs_mark_for_revalidate(dir); 334 nfs_mark_for_revalidate(dir);
370 return 1; 335 return 1;
371} 336}
@@ -385,8 +350,6 @@ static int
385nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 350nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
386 struct inode *new_dir) 351 struct inode *new_dir)
387{ 352{
388 if (nfs_async_handle_expired_key(task))
389 return 0;
390 nfs_mark_for_revalidate(old_dir); 353 nfs_mark_for_revalidate(old_dir);
391 nfs_mark_for_revalidate(new_dir); 354 nfs_mark_for_revalidate(new_dir);
392 return 1; 355 return 1;
@@ -642,9 +605,6 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
642{ 605{
643 struct inode *inode = data->header->inode; 606 struct inode *inode = data->header->inode;
644 607
645 if (nfs_async_handle_expired_key(task))
646 return -EAGAIN;
647
648 nfs_invalidate_atime(inode); 608 nfs_invalidate_atime(inode);
649 if (task->tk_status >= 0) { 609 if (task->tk_status >= 0) {
650 nfs_refresh_inode(inode, data->res.fattr); 610 nfs_refresh_inode(inode, data->res.fattr);
@@ -671,9 +631,6 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
671{ 631{
672 struct inode *inode = data->header->inode; 632 struct inode *inode = data->header->inode;
673 633
674 if (nfs_async_handle_expired_key(task))
675 return -EAGAIN;
676
677 if (task->tk_status >= 0) 634 if (task->tk_status >= 0)
678 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 635 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
679 return 0; 636 return 0;
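The deleted proc.c wrapper retried a synchronous RPC whenever it failed with -EKEYEXPIRED (typically an expired krb5 TGT), sleeping between attempts until a fatal signal arrived; with this patch that handling presumably lives in shared RPC client code rather than per-operation NFSv2 wrappers. The shape of the removed loop, as a generic hedged sketch with stand-in helpers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int attempts;

/* Stand-in for rpc_call_sync(): fail twice with -EKEYEXPIRED, then succeed. */
static int do_call(void)
{
	return ++attempts <= 2 ? -EKEYEXPIRED : 0;
}

static bool fatal_signal_pending(void) { return false; }

/* The pattern the deleted wrapper used: retry a transient "key expired"
 * failure until it clears or the caller is being killed. */
static int call_with_retry(void)
{
	int res;

	do {
		res = do_call();
		if (res != -EKEYEXPIRED)
			break;
		/* the kernel slept NFS_JUKEBOX_RETRY_TIME here */
		res = -ERESTARTSYS;	/* reported if a signal ends the loop */
	} while (!fatal_signal_pending());
	return res;
}

int main(void)
{
	printf("result=%d after %d attempts\n", call_with_retry(), attempts);
	return 0;
}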
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 652d3f7176a9..aa5315bb3666 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -64,6 +64,7 @@
64#include "iostat.h" 64#include "iostat.h"
65#include "internal.h" 65#include "internal.h"
66#include "fscache.h" 66#include "fscache.h"
67#include "nfs4session.h"
67#include "pnfs.h" 68#include "pnfs.h"
68#include "nfs.h" 69#include "nfs.h"
69 70
@@ -307,6 +308,7 @@ const struct super_operations nfs_sops = {
307 .alloc_inode = nfs_alloc_inode, 308 .alloc_inode = nfs_alloc_inode,
308 .destroy_inode = nfs_destroy_inode, 309 .destroy_inode = nfs_destroy_inode,
309 .write_inode = nfs_write_inode, 310 .write_inode = nfs_write_inode,
311 .drop_inode = nfs_drop_inode,
310 .put_super = nfs_put_super, 312 .put_super = nfs_put_super,
311 .statfs = nfs_statfs, 313 .statfs = nfs_statfs,
312 .evict_inode = nfs_evict_inode, 314 .evict_inode = nfs_evict_inode,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9347ab7c9574..5209916e1222 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -202,7 +202,6 @@ out:
202/* A writeback failed: mark the page as bad, and invalidate the page cache */ 202/* A writeback failed: mark the page as bad, and invalidate the page cache */
203static void nfs_set_pageerror(struct page *page) 203static void nfs_set_pageerror(struct page *page)
204{ 204{
205 SetPageError(page);
206 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); 205 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
207} 206}
208 207
@@ -239,21 +238,18 @@ int nfs_congestion_kb;
239#define NFS_CONGESTION_OFF_THRESH \ 238#define NFS_CONGESTION_OFF_THRESH \
240 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) 239 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
241 240
242static int nfs_set_page_writeback(struct page *page) 241static void nfs_set_page_writeback(struct page *page)
243{ 242{
243 struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
244 int ret = test_set_page_writeback(page); 244 int ret = test_set_page_writeback(page);
245 245
246 if (!ret) { 246 WARN_ON_ONCE(ret != 0);
247 struct inode *inode = page_file_mapping(page)->host;
248 struct nfs_server *nfss = NFS_SERVER(inode);
249 247
250 if (atomic_long_inc_return(&nfss->writeback) > 248 if (atomic_long_inc_return(&nfss->writeback) >
251 NFS_CONGESTION_ON_THRESH) { 249 NFS_CONGESTION_ON_THRESH) {
252 set_bdi_congested(&nfss->backing_dev_info, 250 set_bdi_congested(&nfss->backing_dev_info,
253 BLK_RW_ASYNC); 251 BLK_RW_ASYNC);
254 }
255 } 252 }
256 return ret;
257} 253}
258 254
259static void nfs_end_page_writeback(struct page *page) 255static void nfs_end_page_writeback(struct page *page)
@@ -315,10 +311,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
315 if (IS_ERR(req)) 311 if (IS_ERR(req))
316 goto out; 312 goto out;
317 313
318 ret = nfs_set_page_writeback(page); 314 nfs_set_page_writeback(page);
319 BUG_ON(ret != 0); 315 WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
320 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
321 316
317 ret = 0;
322 if (!nfs_pageio_add_request(pgio, req)) { 318 if (!nfs_pageio_add_request(pgio, req)) {
323 nfs_redirty_request(req); 319 nfs_redirty_request(req);
324 ret = pgio->pg_error; 320 ret = pgio->pg_error;
@@ -451,8 +447,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
451 struct inode *inode = req->wb_context->dentry->d_inode; 447 struct inode *inode = req->wb_context->dentry->d_inode;
452 struct nfs_inode *nfsi = NFS_I(inode); 448 struct nfs_inode *nfsi = NFS_I(inode);
453 449
454 BUG_ON (!NFS_WBACK_BUSY(req));
455
456 spin_lock(&inode->i_lock); 450 spin_lock(&inode->i_lock);
457 if (likely(!PageSwapCache(req->wb_page))) { 451 if (likely(!PageSwapCache(req->wb_page))) {
458 set_page_private(req->wb_page, 0); 452 set_page_private(req->wb_page, 0);
@@ -884,7 +878,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
884{ 878{
885 if (nfs_have_delegated_attributes(inode)) 879 if (nfs_have_delegated_attributes(inode))
886 goto out; 880 goto out;
887 if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) 881 if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
888 return false; 882 return false;
889out: 883out:
890 return PageUptodate(page) != 0; 884 return PageUptodate(page) != 0;
@@ -1727,7 +1721,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1727 struct nfs_page *req; 1721 struct nfs_page *req;
1728 int ret = 0; 1722 int ret = 0;
1729 1723
1730 BUG_ON(!PageLocked(page));
1731 for (;;) { 1724 for (;;) {
1732 wait_on_page_writeback(page); 1725 wait_on_page_writeback(page);
1733 req = nfs_page_find_request(page); 1726 req = nfs_page_find_request(page);
@@ -1829,7 +1822,7 @@ int __init nfs_init_writepagecache(void)
1829 goto out_destroy_write_mempool; 1822 goto out_destroy_write_mempool;
1830 1823
1831 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, 1824 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
1832 nfs_wdata_cachep); 1825 nfs_cdata_cachep);
1833 if (nfs_commit_mempool == NULL) 1826 if (nfs_commit_mempool == NULL)
1834 goto out_destroy_commit_cache; 1827 goto out_destroy_commit_cache;
1835 1828
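Two behavioral notes on the write.c hunks above: nfs_set_page_writeback() can no longer report failure (an unexpected test_set_page_writeback() result is now just warned about once), and nfs_commit_mempool is carved from the correct slab (nfs_cdata_cachep rather than nfs_wdata_cachep). The congestion accounting that nfs_set_page_writeback() feeds uses a classic hysteresis pair, with OFF_THRESH set to ON_THRESH minus a quarter, so the bdi does not flap congested/uncongested around a single boundary. The threshold math, checked in isolation:

#include <stdio.h>

/* Mirrors NFS_CONGESTION_OFF_THRESH: back off only after writeback
 * drops 25% below the level that marked the device congested. */
int main(void)
{
	long on_thresh = 64;			/* pages; illustrative value */
	long off_thresh = on_thresh - (on_thresh >> 2);

	printf("congest at >%ld pages, clear at <=%ld pages\n",
	       on_thresh, off_thresh);		/* 64 / 48 */
	return 0;
}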
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3e7b2a0dc0c8..07f76db04ec7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
431 mapping->host = inode; 431 mapping->host = inode;
432 mapping->flags = 0; 432 mapping->flags = 0;
433 mapping_set_gfp_mask(mapping, GFP_NOFS); 433 mapping_set_gfp_mask(mapping, GFP_NOFS);
434 mapping->assoc_mapping = NULL; 434 mapping->private_data = NULL;
435 mapping->backing_dev_info = bdi; 435 mapping->backing_dev_info = bdi;
436 mapping->a_ops = &empty_aops; 436 mapping->a_ops = &empty_aops;
437} 437}
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index ae5f33a6d868..96d3420d0242 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ 1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
2 mark.o vfsmount_mark.o 2 mark.o vfsmount_mark.o fdinfo.o
3 3
4obj-y += dnotify/ 4obj-y += dnotify/
5obj-y += inotify/ 5obj-y += inotify/
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 7dceff005a67..e5f911bd80d2 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -4,7 +4,7 @@ config FANOTIFY
4 select ANON_INODES 4 select ANON_INODES
5 default n 5 default n
6 ---help--- 6 ---help---
7 Say Y here to enable fanotify suport. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 an open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f35794b97e8e..a50636025364 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
21 if ((old->path.mnt == new->path.mnt) && 21 if ((old->path.mnt == new->path.mnt) &&
22 (old->path.dentry == new->path.dentry)) 22 (old->path.dentry == new->path.dentry))
23 return true; 23 return true;
24 break;
24 case (FSNOTIFY_EVENT_NONE): 25 case (FSNOTIFY_EVENT_NONE):
25 return true; 26 return true;
26 default: 27 default:
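The one-line fanotify.c fix above adds a break so that a path-event comparison that does not match no longer falls through into the FSNOTIFY_EVENT_NONE case, which returns true ("merge these events"). A minimal demonstration of the bug class, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

enum kind { KIND_PATH, KIND_NONE, KIND_OTHER };

static bool should_merge(enum kind k, bool paths_equal)
{
	switch (k) {
	case KIND_PATH:
		if (paths_equal)
			return true;
		break;		/* without this, falls into KIND_NONE: true */
	case KIND_NONE:
		return true;
	default:
		return false;
	}
	return false;
}

int main(void)
{
	/* Unequal paths must not merge; the fallthrough said they did. */
	printf("merge=%d\n", should_merge(KIND_PATH, false));	/* prints 0 */
	return 0;
}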
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 721d692fa8d4..a5cd9bba022f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,7 @@
17#include <asm/ioctls.h> 17#include <asm/ioctls.h>
18 18
19#include "../../mount.h" 19#include "../../mount.h"
20#include "../fdinfo.h"
20 21
21#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 22#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
22#define FANOTIFY_DEFAULT_MAX_MARKS 8192 23#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -258,7 +259,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
258 if (ret) 259 if (ret)
259 goto out_close_fd; 260 goto out_close_fd;
260 261
261 fd_install(fd, f); 262 if (fd != FAN_NOFD)
263 fd_install(fd, f);
262 return fanotify_event_metadata.event_len; 264 return fanotify_event_metadata.event_len;
263 265
264out_close_fd: 266out_close_fd:
@@ -427,6 +429,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
427} 429}
428 430
429static const struct file_operations fanotify_fops = { 431static const struct file_operations fanotify_fops = {
432 .show_fdinfo = fanotify_show_fdinfo,
430 .poll = fanotify_poll, 433 .poll = fanotify_poll,
431 .read = fanotify_read, 434 .read = fanotify_read,
432 .write = fanotify_write, 435 .write = fanotify_write,
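On the listener side, the FAN_NOFD guard added above matters because some events carry no open descriptor: metadata->fd is FAN_NOFD and must not be installed or closed as a real fd. A hedged userspace reading loop (needs CAP_SYS_ADMIN; error handling trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t len;
	int fan = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

	if (fan < 0)
		return 1;
	fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MOUNT,
		      FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/");

	len = read(fan, buf, sizeof(buf));
	for (struct fanotify_event_metadata *md = (void *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		if (md->fd == FAN_NOFD) {	/* e.g. a queue-overflow event */
			printf("event without fd, mask=%llx\n",
			       (unsigned long long)md->mask);
			continue;
		}
		printf("event on fd %d, mask=%llx\n", md->fd,
		       (unsigned long long)md->mask);
		close(md->fd);	/* the listener owns the descriptor */
	}
	close(fan);
	return 0;
}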
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
new file mode 100644
index 000000000000..514c4b81483d
--- /dev/null
+++ b/fs/notify/fdinfo.c
@@ -0,0 +1,179 @@
1#include <linux/file.h>
2#include <linux/fs.h>
3#include <linux/fsnotify_backend.h>
4#include <linux/idr.h>
5#include <linux/init.h>
6#include <linux/inotify.h>
7#include <linux/fanotify.h>
8#include <linux/kernel.h>
9#include <linux/namei.h>
10#include <linux/sched.h>
11#include <linux/types.h>
12#include <linux/seq_file.h>
13#include <linux/proc_fs.h>
14#include <linux/exportfs.h>
15
16#include "inotify/inotify.h"
17#include "../fs/mount.h"
18
19#if defined(CONFIG_PROC_FS)
20
21#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY)
22
23static int show_fdinfo(struct seq_file *m, struct file *f,
24 int (*show)(struct seq_file *m, struct fsnotify_mark *mark))
25{
26 struct fsnotify_group *group = f->private_data;
27 struct fsnotify_mark *mark;
28 int ret = 0;
29
30 spin_lock(&group->mark_lock);
31 list_for_each_entry(mark, &group->marks_list, g_list) {
32 ret = show(m, mark);
33 if (ret)
34 break;
35 }
36 spin_unlock(&group->mark_lock);
37 return ret;
38}
39
40#if defined(CONFIG_EXPORTFS)
41static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
42{
43 struct {
44 struct file_handle handle;
45 u8 pad[64];
46 } f;
47 int size, ret, i;
48
49 f.handle.handle_bytes = sizeof(f.pad);
50 size = f.handle.handle_bytes >> 2;
51
52 ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
53 if ((ret == 255) || (ret == -ENOSPC)) {
54 WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
55 return 0;
56 }
57
58 f.handle.handle_type = ret;
59 f.handle.handle_bytes = size * sizeof(u32);
60
61 ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
62 f.handle.handle_bytes, f.handle.handle_type);
63
64 for (i = 0; i < f.handle.handle_bytes; i++)
65 ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
66
67 return ret;
68}
69#else
70static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
71{
72 return 0;
73}
74#endif
75
76#ifdef CONFIG_INOTIFY_USER
77
78static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
79{
80 struct inotify_inode_mark *inode_mark;
81 struct inode *inode;
82 int ret = 0;
83
84 if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
85 return 0;
86
87 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
88 inode = igrab(mark->i.inode);
89 if (inode) {
90 ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
91 "mask:%x ignored_mask:%x ",
92 inode_mark->wd, inode->i_ino,
93 inode->i_sb->s_dev,
94 mark->mask, mark->ignored_mask);
95 ret |= show_mark_fhandle(m, inode);
96 ret |= seq_putc(m, '\n');
97 iput(inode);
98 }
99
100 return ret;
101}
102
103int inotify_show_fdinfo(struct seq_file *m, struct file *f)
104{
105 return show_fdinfo(m, f, inotify_fdinfo);
106}
107
108#endif /* CONFIG_INOTIFY_USER */
109
110#ifdef CONFIG_FANOTIFY
111
112static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
113{
114 unsigned int mflags = 0;
115 struct inode *inode;
116 int ret = 0;
117
118 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
119 return 0;
120
121 if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
122 mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
123
124 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
125 inode = igrab(mark->i.inode);
126 if (!inode)
127 goto out;
128 ret = seq_printf(m, "fanotify ino:%lx sdev:%x "
129 "mflags:%x mask:%x ignored_mask:%x ",
130 inode->i_ino, inode->i_sb->s_dev,
131 mflags, mark->mask, mark->ignored_mask);
132 ret |= show_mark_fhandle(m, inode);
133 ret |= seq_putc(m, '\n');
134 iput(inode);
135 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
136 struct mount *mnt = real_mount(mark->m.mnt);
137
138 ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x "
139 "ignored_mask:%x\n", mnt->mnt_id, mflags,
140 mark->mask, mark->ignored_mask);
141 }
142out:
143 return ret;
144}
145
146int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
147{
148 struct fsnotify_group *group = f->private_data;
149 unsigned int flags = 0;
150
151 switch (group->priority) {
152 case FS_PRIO_0:
153 flags |= FAN_CLASS_NOTIF;
154 break;
155 case FS_PRIO_1:
156 flags |= FAN_CLASS_CONTENT;
157 break;
158 case FS_PRIO_2:
159 flags |= FAN_CLASS_PRE_CONTENT;
160 break;
161 }
162
163 if (group->max_events == UINT_MAX)
164 flags |= FAN_UNLIMITED_QUEUE;
165
166 if (group->fanotify_data.max_marks == UINT_MAX)
167 flags |= FAN_UNLIMITED_MARKS;
168
169 seq_printf(m, "fanotify flags:%x event-flags:%x\n",
170 flags, group->fanotify_data.f_flags);
171
172 return show_fdinfo(m, f, fanotify_fdinfo);
173}
174
175#endif /* CONFIG_FANOTIFY */
176
177#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */
178
179#endif /* CONFIG_PROC_FS */
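The new fdinfo.c above is what makes notification-descriptor state visible through procfs: reading /proc/<pid>/fdinfo/<fd> on an inotify descriptor now reports each watch with its wd, inode, device and masks, plus an opaque file handle. A small hedged userspace check (the exact output fields follow the seq_printf() formats above, so this only works on a kernel carrying this patch):

#include <limits.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	char path[PATH_MAX], line[256];
	int fd = inotify_init();
	FILE *f;

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE) < 0)
		return 1;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* pos/flags, then one line per mark */
	fclose(f);
	close(fd);
	return 0;
}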
diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h
new file mode 100644
index 000000000000..556afda990e9
--- /dev/null
+++ b/fs/notify/fdinfo.h
@@ -0,0 +1,27 @@
1#ifndef __FSNOTIFY_FDINFO_H__
2#define __FSNOTIFY_FDINFO_H__
3
4#include <linux/errno.h>
5#include <linux/proc_fs.h>
6
7struct seq_file;
8struct file;
9
10#ifdef CONFIG_PROC_FS
11
12#ifdef CONFIG_INOTIFY_USER
13extern int inotify_show_fdinfo(struct seq_file *m, struct file *f);
14#endif
15
16#ifdef CONFIG_FANOTIFY
17extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f);
18#endif
19
20#else /* CONFIG_PROC_FS */
21
22#define inotify_show_fdinfo NULL
23#define fanotify_show_fdinfo NULL
24
25#endif /* CONFIG_PROC_FS */
26
27#endif /* __FSNOTIFY_FDINFO_H__ */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index b13c00ac48eb..f3035691f528 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -116,8 +116,9 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
116 * given a group and inode, find the mark associated with that combination. 116 * given a group and inode, find the mark associated with that combination.
117 * if found take a reference to that mark and return it, else return NULL 117 * if found take a reference to that mark and return it, else return NULL
118 */ 118 */
119struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, 119static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
120 struct inode *inode) 120 struct fsnotify_group *group,
121 struct inode *inode)
121{ 122{
122 struct fsnotify_mark *mark; 123 struct fsnotify_mark *mark;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index c311dda054a3..36cb013c7c13 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -40,6 +40,7 @@
40#include <linux/wait.h> 40#include <linux/wait.h>
41 41
42#include "inotify.h" 42#include "inotify.h"
43#include "../fdinfo.h"
43 44
44#include <asm/ioctls.h> 45#include <asm/ioctls.h>
45 46
@@ -335,6 +336,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
335} 336}
336 337
337static const struct file_operations inotify_fops = { 338static const struct file_operations inotify_fops = {
339 .show_fdinfo = inotify_show_fdinfo,
338 .poll = inotify_poll, 340 .poll = inotify_poll,
339 .read = inotify_read, 341 .read = inotify_read,
340 .fasync = inotify_fasync, 342 .fasync = inotify_fasync,
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index c887b1378f7e..48cb994e4922 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -18,7 +18,7 @@
18 18
19/* 19/*
20 * Basic idea behind the notification queue: An fsnotify group (like inotify) 20 * Basic idea behind the notification queue: An fsnotify group (like inotify)
21 * sends the userspace notification about events asyncronously some time after 21 * sends the userspace notification about events asynchronously some time after
22 * the event happened. When inotify gets an event it will need to add that 22 * the event happened. When inotify gets an event it will need to add that
23 * event to the group notify queue. Since a single event might need to be on 23 * event to the group notify queue. Since a single event might need to be on
24 * multiple group's notification queues we can't add the event directly to each 24 * multiple group's notification queues we can't add the event directly to each
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 70b5863a2d64..f487aa343442 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,7 +832,7 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) 835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
836{ 836{
837 struct inode *inode = file->f_mapping->host; 837 struct inode *inode = file->f_mapping->host;
838 int ret; 838 int ret;
@@ -843,7 +843,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
843 struct buffer_head *di_bh = NULL; 843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec; 844 struct ocfs2_extent_rec rec;
845 845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); 846 BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
847 847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0); 848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) { 849 if (ret) {
@@ -859,7 +859,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
859 } 859 }
860 860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE) 862 if (whence == SEEK_HOLE)
863 *offset = inode->i_size; 863 *offset = inode->i_size;
864 goto out_unlock; 864 goto out_unlock;
865 } 865 }
@@ -888,8 +888,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; 888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 } 889 }
890 890
891 if ((!is_data && origin == SEEK_HOLE) || 891 if ((!is_data && whence == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) { 892 (is_data && whence == SEEK_DATA)) {
893 if (extoff > *offset) 893 if (extoff > *offset)
894 *offset = extoff; 894 *offset = extoff;
895 goto out_unlock; 895 goto out_unlock;
@@ -899,7 +899,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
899 cpos += clen; 899 cpos += clen;
900 } 900 }
901 901
902 if (origin == SEEK_HOLE) { 902 if (whence == SEEK_HOLE) {
903 extoff = cpos; 903 extoff = cpos;
904 extoff <<= cs_bits; 904 extoff <<= cs_bits;
905 extlen = clen; 905 extlen = clen;
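The ocfs2 hunks above are a mechanical rename of the llseek argument from origin to whence (matching VFS terminology), not a behavior change. The SEEK_DATA/SEEK_HOLE cases they implement are exercised from userspace like this (a hedged sketch; SEEK_DATA and SEEK_HOLE need _GNU_SOURCE and a filesystem that supports them):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "file", O_RDONLY);
	off_t data, hole;

	if (fd < 0)
		return 1;
	data = lseek(fd, 0, SEEK_DATA);	/* first data at/after offset 0 */
	hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at/after offset 0 */
	printf("data@%lld hole@%lld\n", (long long)data, (long long)hole);
	close(fd);
	return 0;
}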
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5a4ee77cec51..fe492e1a3cfc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2513 ret = sd.num_spliced; 2513 ret = sd.num_spliced;
2514 2514
2515 if (ret > 0) { 2515 if (ret > 0) {
2516 unsigned long nr_pages;
2517 int err; 2516 int err;
2518 2517
2519 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2520
2521 err = generic_write_sync(out, *ppos, ret); 2518 err = generic_write_sync(out, *ppos, ret);
2522 if (err) 2519 if (err)
2523 ret = err; 2520 ret = err;
2524 else 2521 else
2525 *ppos += ret; 2522 *ppos += ret;
2526 2523
2527 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2524 balance_dirty_pages_ratelimited(mapping);
2528 } 2525 }
2529 2526
2530 return ret; 2527 return ret;
@@ -2640,14 +2637,14 @@ bail:
2640} 2637}
2641 2638
2642/* Refer generic_file_llseek_unlocked() */ 2639/* Refer generic_file_llseek_unlocked() */
2643static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) 2640static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2644{ 2641{
2645 struct inode *inode = file->f_mapping->host; 2642 struct inode *inode = file->f_mapping->host;
2646 int ret = 0; 2643 int ret = 0;
2647 2644
2648 mutex_lock(&inode->i_mutex); 2645 mutex_lock(&inode->i_mutex);
2649 2646
2650 switch (origin) { 2647 switch (whence) {
2651 case SEEK_SET: 2648 case SEEK_SET:
2652 break; 2649 break;
2653 case SEEK_END: 2650 case SEEK_END:
@@ -2662,7 +2659,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2662 break; 2659 break;
2663 case SEEK_DATA: 2660 case SEEK_DATA:
2664 case SEEK_HOLE: 2661 case SEEK_HOLE:
2665 ret = ocfs2_seek_data_hole_offset(file, &offset, origin); 2662 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2666 if (ret) 2663 if (ret)
2667 goto out; 2664 goto out;
2668 break; 2665 break;
diff --git a/fs/open.c b/fs/open.c
index 59071f55bf7f..182d8667b7bd 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
435 goto dput_and_out; 435 goto dput_and_out;
436 436
437 error = -EPERM; 437 error = -EPERM;
438 if (!capable(CAP_SYS_CHROOT)) 438 if (!nsown_capable(CAP_SYS_CHROOT))
439 goto dput_and_out; 439 goto dput_and_out;
440 error = security_path_chroot(&path); 440 error = security_path_chroot(&path);
441 if (error) 441 if (error)
diff --git a/fs/pnode.h b/fs/pnode.h
index 65c60979d541..19b853a3445c 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -22,6 +22,7 @@
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PRIVATE 0x10 24#define CL_PRIVATE 0x10
25#define CL_SHARED_TO_SLAVE 0x20
25 26
26static inline void set_mnt_shared(struct mount *mnt) 27static inline void set_mnt_shared(struct mount *mnt)
27{ 28{
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 99349efbbc2b..981b05601931 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -21,6 +21,7 @@ proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o 23proc-y += namespaces.o
24proc-y += self.o
24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 25proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
25proc-$(CONFIG_NET) += proc_net.o 26proc-$(CONFIG_NET) += proc_net.o
26proc-$(CONFIG_PROC_KCORE) += kcore.o 27proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36cae..6a91e6ffbcbd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
162static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 162static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
163 struct pid *pid, struct task_struct *p) 163 struct pid *pid, struct task_struct *p)
164{ 164{
165 struct user_namespace *user_ns = current_user_ns(); 165 struct user_namespace *user_ns = seq_user_ns(m);
166 struct group_info *group_info; 166 struct group_info *group_info;
167 int g; 167 int g;
168 struct fdtable *fdt = NULL; 168 struct fdtable *fdt = NULL;
@@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
212 group_info = cred->group_info; 212 group_info = cred->group_info;
213 task_unlock(p); 213 task_unlock(p);
214 214
215 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) 215 for (g = 0; g < group_info->ngroups; g++)
216 seq_printf(m, "%d ", 216 seq_printf(m, "%d ",
217 from_kgid_munged(user_ns, GROUP_AT(group_info, g))); 217 from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
218 put_cred(cred); 218 put_cred(cred);
@@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
220 seq_putc(m, '\n'); 220 seq_putc(m, '\n');
221} 221}
222 222
223static void render_sigset_t(struct seq_file *m, const char *header, 223void render_sigset_t(struct seq_file *m, const char *header,
224 sigset_t *set) 224 sigset_t *set)
225{ 225{
226 int i; 226 int i;
@@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header,
308 seq_putc(m, '\n'); 308 seq_putc(m, '\n');
309} 309}
310 310
311/* Remove non-existent capabilities */
312#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
313 CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
314
311static inline void task_cap(struct seq_file *m, struct task_struct *p) 315static inline void task_cap(struct seq_file *m, struct task_struct *p)
312{ 316{
313 const struct cred *cred; 317 const struct cred *cred;
@@ -321,12 +325,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
321 cap_bset = cred->cap_bset; 325 cap_bset = cred->cap_bset;
322 rcu_read_unlock(); 326 rcu_read_unlock();
323 327
328 NORM_CAPS(cap_inheritable);
329 NORM_CAPS(cap_permitted);
330 NORM_CAPS(cap_effective);
331 NORM_CAPS(cap_bset);
332
324 render_cap_t(m, "CapInh:\t", &cap_inheritable); 333 render_cap_t(m, "CapInh:\t", &cap_inheritable);
325 render_cap_t(m, "CapPrm:\t", &cap_permitted); 334 render_cap_t(m, "CapPrm:\t", &cap_permitted);
326 render_cap_t(m, "CapEff:\t", &cap_effective); 335 render_cap_t(m, "CapEff:\t", &cap_effective);
327 render_cap_t(m, "CapBnd:\t", &cap_bset); 336 render_cap_t(m, "CapBnd:\t", &cap_bset);
328} 337}
329 338
339static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
340{
341#ifdef CONFIG_SECCOMP
342 seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
343#endif
344}
345
330static inline void task_context_switch_counts(struct seq_file *m, 346static inline void task_context_switch_counts(struct seq_file *m,
331 struct task_struct *p) 347 struct task_struct *p)
332{ 348{
@@ -360,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
360 } 376 }
361 task_sig(m, task); 377 task_sig(m, task);
362 task_cap(m, task); 378 task_cap(m, task);
379 task_seccomp(m, task);
363 task_cpus_allowed(m, task); 380 task_cpus_allowed(m, task);
364 cpuset_task_status_allowed(m, task); 381 cpuset_task_status_allowed(m, task);
365 task_context_switch_counts(m, task); 382 task_context_switch_counts(m, task);
@@ -438,7 +455,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
438 455
439 min_flt += sig->min_flt; 456 min_flt += sig->min_flt;
440 maj_flt += sig->maj_flt; 457 maj_flt += sig->maj_flt;
441 thread_group_times(task, &utime, &stime); 458 thread_group_cputime_adjusted(task, &utime, &stime);
442 gtime += sig->gtime; 459 gtime += sig->gtime;
443 } 460 }
444 461
@@ -454,7 +471,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
454 if (!whole) { 471 if (!whole) {
455 min_flt = task->min_flt; 472 min_flt = task->min_flt;
456 maj_flt = task->maj_flt; 473 maj_flt = task->maj_flt;
457 task_times(task, &utime, &stime); 474 task_cputime_adjusted(task, &utime, &stime);
458 gtime = task->gtime; 475 gtime = task->gtime;
459 } 476 }
460 477
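The NORM_CAPS() addition in array.c masks off capability bits above CAP_LAST_CAP in the top word before printing, so /proc/<pid>/status never advertises capabilities the running kernel does not define. The mask arithmetic, checked standalone (the CAP_LAST_CAP value here is illustrative, not tied to any particular kernel):

#include <stdint.h>
#include <stdio.h>

#define CAP_LAST_CAP	35			/* illustrative value */
#define CAP_TO_INDEX(x)	((x) >> 5)		/* which 32-bit word */
#define CAP_TO_MASK(x)	(1u << ((x) & 31))	/* bit within the word */

int main(void)
{
	/* Keep bits 0..CAP_LAST_CAP of the top word, clear the rest. */
	uint32_t keep = CAP_TO_MASK(CAP_LAST_CAP + 1) - 1;
	uint32_t word = 0xffffffffu;		/* pretend all bits set */

	printf("index=%d mask=%#x normalized=%#x\n",
	       CAP_TO_INDEX(CAP_LAST_CAP), keep, word & keep);
	/* index=1 mask=0xf normalized=0xf: only caps 32..35 survive */
	return 0;
}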
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd7..5a5a0be40e40 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,12 +873,119 @@ static const struct file_operations proc_environ_operations = {
873 .release = mem_release, 873 .release = mem_release,
874}; 874};
875 875
876static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
877 loff_t *ppos)
878{
879 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
880 char buffer[PROC_NUMBUF];
881 int oom_adj = OOM_ADJUST_MIN;
882 size_t len;
883 unsigned long flags;
884
885 if (!task)
886 return -ESRCH;
887 if (lock_task_sighand(task, &flags)) {
888 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
889 oom_adj = OOM_ADJUST_MAX;
890 else
891 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
892 OOM_SCORE_ADJ_MAX;
893 unlock_task_sighand(task, &flags);
894 }
895 put_task_struct(task);
896 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
897 return simple_read_from_buffer(buf, count, ppos, buffer, len);
898}
899
900static ssize_t oom_adj_write(struct file *file, const char __user *buf,
901 size_t count, loff_t *ppos)
902{
903 struct task_struct *task;
904 char buffer[PROC_NUMBUF];
905 int oom_adj;
906 unsigned long flags;
907 int err;
908
909 memset(buffer, 0, sizeof(buffer));
910 if (count > sizeof(buffer) - 1)
911 count = sizeof(buffer) - 1;
912 if (copy_from_user(buffer, buf, count)) {
913 err = -EFAULT;
914 goto out;
915 }
916
917 err = kstrtoint(strstrip(buffer), 0, &oom_adj);
918 if (err)
919 goto out;
920 if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
921 oom_adj != OOM_DISABLE) {
922 err = -EINVAL;
923 goto out;
924 }
925
926 task = get_proc_task(file->f_path.dentry->d_inode);
927 if (!task) {
928 err = -ESRCH;
929 goto out;
930 }
931
932 task_lock(task);
933 if (!task->mm) {
934 err = -EINVAL;
935 goto err_task_lock;
936 }
937
938 if (!lock_task_sighand(task, &flags)) {
939 err = -ESRCH;
940 goto err_task_lock;
941 }
942
943 /*
944 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
945 * value is always attainable.
946 */
947 if (oom_adj == OOM_ADJUST_MAX)
948 oom_adj = OOM_SCORE_ADJ_MAX;
949 else
950 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
951
952 if (oom_adj < task->signal->oom_score_adj &&
953 !capable(CAP_SYS_RESOURCE)) {
954 err = -EACCES;
955 goto err_sighand;
956 }
957
958 /*
959 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
960 * /proc/pid/oom_score_adj instead.
961 */
962 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
963 current->comm, task_pid_nr(current), task_pid_nr(task),
964 task_pid_nr(task));
965
966 task->signal->oom_score_adj = oom_adj;
967 trace_oom_score_adj_update(task);
968err_sighand:
969 unlock_task_sighand(task, &flags);
970err_task_lock:
971 task_unlock(task);
972 put_task_struct(task);
973out:
974 return err < 0 ? err : count;
975}
976
977static const struct file_operations proc_oom_adj_operations = {
978 .read = oom_adj_read,
979 .write = oom_adj_write,
980 .llseek = generic_file_llseek,
981};
982
876static ssize_t oom_score_adj_read(struct file *file, char __user *buf, 983static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
877 size_t count, loff_t *ppos) 984 size_t count, loff_t *ppos)
878{ 985{
879 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 986 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
880 char buffer[PROC_NUMBUF]; 987 char buffer[PROC_NUMBUF];
881 int oom_score_adj = OOM_SCORE_ADJ_MIN; 988 short oom_score_adj = OOM_SCORE_ADJ_MIN;
882 unsigned long flags; 989 unsigned long flags;
883 size_t len; 990 size_t len;
884 991
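The resurrected oom_adj file above is a compatibility shim: writes are scaled from the legacy [-17, 15] oom_adj range into the [-1000, 1000] oom_score_adj range and reads are scaled back, with -17 (OOM_DISABLE) pinned to -1000 and 15 pinned to +1000 so both extremes stay attainable. The round-trip, worked through in isolation (constants as in the uapi oom.h):

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

/* Write path above: legacy value -> oom_score_adj. */
static int adj_to_score(int oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;	/* keep the maximum attainable */
	return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
}

/* Read path above: oom_score_adj -> legacy value. */
static int score_to_adj(int score)
{
	if (score == OOM_SCORE_ADJ_MAX)
		return OOM_ADJUST_MAX;
	return (score * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX;
}

int main(void)
{
	/* Note C truncation makes small positive values lossy (1 -> 58 -> 0). */
	for (int adj = OOM_DISABLE; adj <= OOM_ADJUST_MAX; adj++)
		printf("oom_adj %3d -> score %5d -> oom_adj %3d\n",
		       adj, adj_to_score(adj), score_to_adj(adj_to_score(adj)));
	return 0;
}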
@@ -889,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
889 unlock_task_sighand(task, &flags); 996 unlock_task_sighand(task, &flags);
890 } 997 }
891 put_task_struct(task); 998 put_task_struct(task);
892 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); 999 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
893 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1000 return simple_read_from_buffer(buf, count, ppos, buffer, len);
894} 1001}
895 1002
@@ -936,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
936 goto err_task_lock; 1043 goto err_task_lock;
937 } 1044 }
938 1045
939 if (oom_score_adj < task->signal->oom_score_adj_min && 1046 if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
940 !capable(CAP_SYS_RESOURCE)) { 1047 !capable(CAP_SYS_RESOURCE)) {
941 err = -EACCES; 1048 err = -EACCES;
942 goto err_sighand; 1049 goto err_sighand;
943 } 1050 }
944 1051
945 task->signal->oom_score_adj = oom_score_adj; 1052 task->signal->oom_score_adj = (short)oom_score_adj;
946 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1053 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
947 task->signal->oom_score_adj_min = oom_score_adj; 1054 task->signal->oom_score_adj_min = (short)oom_score_adj;
948 trace_oom_score_adj_update(task); 1055 trace_oom_score_adj_update(task);
949 1056
950err_sighand: 1057err_sighand:
@@ -1770,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1770 if (!vma) 1877 if (!vma)
1771 goto out_no_vma; 1878 goto out_no_vma;
1772 1879
1773 result = proc_map_files_instantiate(dir, dentry, task, 1880 if (vma->vm_file)
1774 (void *)(unsigned long)vma->vm_file->f_mode); 1881 result = proc_map_files_instantiate(dir, dentry, task,
1882 (void *)(unsigned long)vma->vm_file->f_mode);
1775 1883
1776out_no_vma: 1884out_no_vma:
1777 up_read(&mm->mmap_sem); 1885 up_read(&mm->mmap_sem);
@@ -2237,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = {
2237}; 2345};
2238#endif 2346#endif
2239 2347
2240/*
2241 * /proc/self:
2242 */
2243static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2244 int buflen)
2245{
2246 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2247 pid_t tgid = task_tgid_nr_ns(current, ns);
2248 char tmp[PROC_NUMBUF];
2249 if (!tgid)
2250 return -ENOENT;
2251 sprintf(tmp, "%d", tgid);
2252 return vfs_readlink(dentry,buffer,buflen,tmp);
2253}
2254
2255static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2256{
2257 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2258 pid_t tgid = task_tgid_nr_ns(current, ns);
2259 char *name = ERR_PTR(-ENOENT);
2260 if (tgid) {
2261 /* 11 for max length of signed int in decimal + NULL term */
2262 name = kmalloc(12, GFP_KERNEL);
2263 if (!name)
2264 name = ERR_PTR(-ENOMEM);
2265 else
2266 sprintf(name, "%d", tgid);
2267 }
2268 nd_set_link(nd, name);
2269 return NULL;
2270}
2271
2272static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2273 void *cookie)
2274{
2275 char *s = nd_get_link(nd);
2276 if (!IS_ERR(s))
2277 kfree(s);
2278}
2279
2280static const struct inode_operations proc_self_inode_operations = {
2281 .readlink = proc_self_readlink,
2282 .follow_link = proc_self_follow_link,
2283 .put_link = proc_self_put_link,
2284};
2285
2286/*
2287 * proc base
2288 *
2289 * These are the directory entries in the root directory of /proc
2290 * that properly belong to the /proc filesystem, as they describe
2291 * describe something that is process related.
2292 */
2293static const struct pid_entry proc_base_stuff[] = {
2294 NOD("self", S_IFLNK|S_IRWXUGO,
2295 &proc_self_inode_operations, NULL, {}),
2296};
2297
2298static struct dentry *proc_base_instantiate(struct inode *dir,
2299 struct dentry *dentry, struct task_struct *task, const void *ptr)
2300{
2301 const struct pid_entry *p = ptr;
2302 struct inode *inode;
2303 struct proc_inode *ei;
2304 struct dentry *error;
2305
2306 /* Allocate the inode */
2307 error = ERR_PTR(-ENOMEM);
2308 inode = new_inode(dir->i_sb);
2309 if (!inode)
2310 goto out;
2311
2312 /* Initialize the inode */
2313 ei = PROC_I(inode);
2314 inode->i_ino = get_next_ino();
2315 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2316
2317 /*
2318 * grab the reference to the task.
2319 */
2320 ei->pid = get_task_pid(task, PIDTYPE_PID);
2321 if (!ei->pid)
2322 goto out_iput;
2323
2324 inode->i_mode = p->mode;
2325 if (S_ISDIR(inode->i_mode))
2326 set_nlink(inode, 2);
2327 if (S_ISLNK(inode->i_mode))
2328 inode->i_size = 64;
2329 if (p->iop)
2330 inode->i_op = p->iop;
2331 if (p->fop)
2332 inode->i_fop = p->fop;
2333 ei->op = p->op;
2334 d_add(dentry, inode);
2335 error = NULL;
2336out:
2337 return error;
2338out_iput:
2339 iput(inode);
2340 goto out;
2341}
2342
2343static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2344{
2345 struct dentry *error;
2346 struct task_struct *task = get_proc_task(dir);
2347 const struct pid_entry *p, *last;
2348
2349 error = ERR_PTR(-ENOENT);
2350
2351 if (!task)
2352 goto out_no_task;
2353
2354 /* Lookup the directory entry */
2355 last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2356 for (p = proc_base_stuff; p <= last; p++) {
2357 if (p->len != dentry->d_name.len)
2358 continue;
2359 if (!memcmp(dentry->d_name.name, p->name, p->len))
2360 break;
2361 }
2362 if (p > last)
2363 goto out;
2364
2365 error = proc_base_instantiate(dir, dentry, task, p);
2366
2367out:
2368 put_task_struct(task);
2369out_no_task:
2370 return error;
2371}
2372
2373static int proc_base_fill_cache(struct file *filp, void *dirent,
2374 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2375{
2376 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2377 proc_base_instantiate, task, p);
2378}
2379
2380#ifdef CONFIG_TASK_IO_ACCOUNTING 2348#ifdef CONFIG_TASK_IO_ACCOUNTING
2381static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2349static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2382{ 2350{
@@ -2598,6 +2566,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2598 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2566 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2599#endif 2567#endif
2600 INF("oom_score", S_IRUGO, proc_oom_score), 2568 INF("oom_score", S_IRUGO, proc_oom_score),
2569 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2601 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2570 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2602#ifdef CONFIG_AUDITSYSCALL 2571#ifdef CONFIG_AUDITSYSCALL
2603 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2572 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2730,10 +2699,6 @@ void proc_flush_task(struct task_struct *task)
2730 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2699 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2731 tgid->numbers[i].nr); 2700 tgid->numbers[i].nr);
2732 } 2701 }
2733
2734 upid = &pid->numbers[pid->level];
2735 if (upid->nr == 1)
2736 pid_ns_release_proc(upid->ns);
2737} 2702}
2738 2703
2739static struct dentry *proc_pid_instantiate(struct inode *dir, 2704static struct dentry *proc_pid_instantiate(struct inode *dir,
@@ -2767,15 +2732,11 @@ out:
2767 2732
2768struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2733struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2769{ 2734{
2770 struct dentry *result; 2735 struct dentry *result = NULL;
2771 struct task_struct *task; 2736 struct task_struct *task;
2772 unsigned tgid; 2737 unsigned tgid;
2773 struct pid_namespace *ns; 2738 struct pid_namespace *ns;
2774 2739
2775 result = proc_base_lookup(dir, dentry);
2776 if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2777 goto out;
2778
2779 tgid = name_to_int(dentry); 2740 tgid = name_to_int(dentry);
2780 if (tgid == ~0U) 2741 if (tgid == ~0U)
2781 goto out; 2742 goto out;
@@ -2838,7 +2799,7 @@ retry:
2838 return iter; 2799 return iter;
2839} 2800}
2840 2801
2841#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) 2802#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
2842 2803
2843static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 2804static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2844 struct tgid_iter iter) 2805 struct tgid_iter iter)
@@ -2858,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
2858/* for the /proc/ directory itself, after non-process stuff has been done */ 2819/* for the /proc/ directory itself, after non-process stuff has been done */
2859int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2820int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2860{ 2821{
2861 unsigned int nr;
2862 struct task_struct *reaper;
2863 struct tgid_iter iter; 2822 struct tgid_iter iter;
2864 struct pid_namespace *ns; 2823 struct pid_namespace *ns;
2865 filldir_t __filldir; 2824 filldir_t __filldir;
2866 2825
2867 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 2826 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
2868 goto out_no_task; 2827 goto out;
2869 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2870
2871 reaper = get_proc_task(filp->f_path.dentry->d_inode);
2872 if (!reaper)
2873 goto out_no_task;
2874
2875 for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2876 const struct pid_entry *p = &proc_base_stuff[nr];
2877 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2878 goto out;
2879 }
2880 2828
2881 ns = filp->f_dentry->d_sb->s_fs_info; 2829 ns = filp->f_dentry->d_sb->s_fs_info;
2882 iter.task = NULL; 2830 iter.task = NULL;
@@ -2897,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2897 } 2845 }
2898 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2846 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2899out: 2847out:
2900 put_task_struct(reaper);
2901out_no_task:
2902 return 0; 2848 return 0;
2903} 2849}
2904 2850
@@ -2964,6 +2910,7 @@ static const struct pid_entry tid_base_stuff[] = {
2964 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2910 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2965#endif 2911#endif
2966 INF("oom_score", S_IRUGO, proc_oom_score), 2912 INF("oom_score", S_IRUGO, proc_oom_score),
2913 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2967 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2914 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2968#ifdef CONFIG_AUDITSYSCALL 2915#ifdef CONFIG_AUDITSYSCALL
2969 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2916 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index f28a875f8779..d7a4a28ef630 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v)
50 if (!ret) { 50 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", 51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
52 (long long)file->f_pos, f_flags); 52 (long long)file->f_pos, f_flags);
53 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file);
53 fput(file); 55 fput(file);
54 } 56 }
55 57
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0d80cef4cfb9..7b3ae3cc0ef9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
350 * Return an inode number between PROC_DYNAMIC_FIRST and 350 * Return an inode number between PROC_DYNAMIC_FIRST and
351 * 0xffffffff, or zero on failure. 351 * 0xffffffff, or zero on failure.
352 */ 352 */
353static unsigned int get_inode_number(void) 353int proc_alloc_inum(unsigned int *inum)
354{ 354{
355 unsigned int i; 355 unsigned int i;
356 int error; 356 int error;
357 357
358retry: 358retry:
359 if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) 359 if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
360 return 0; 360 return -ENOMEM;
361 361
362 spin_lock(&proc_inum_lock); 362 spin_lock(&proc_inum_lock);
363 error = ida_get_new(&proc_inum_ida, &i); 363 error = ida_get_new(&proc_inum_ida, &i);
@@ -365,18 +365,19 @@ retry:
365 if (error == -EAGAIN) 365 if (error == -EAGAIN)
366 goto retry; 366 goto retry;
367 else if (error) 367 else if (error)
368 return 0; 368 return error;
369 369
370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
371 spin_lock(&proc_inum_lock); 371 spin_lock(&proc_inum_lock);
372 ida_remove(&proc_inum_ida, i); 372 ida_remove(&proc_inum_ida, i);
373 spin_unlock(&proc_inum_lock); 373 spin_unlock(&proc_inum_lock);
374 return 0; 374 return -ENOSPC;
375 } 375 }
376 return PROC_DYNAMIC_FIRST + i; 376 *inum = PROC_DYNAMIC_FIRST + i;
377 return 0;
377} 378}
378 379
379static void release_inode_number(unsigned int inum) 380void proc_free_inum(unsigned int inum)
380{ 381{
381 spin_lock(&proc_inum_lock); 382 spin_lock(&proc_inum_lock);
382 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 383 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
@@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = {
554 555
555static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 556static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
556{ 557{
557 unsigned int i;
558 struct proc_dir_entry *tmp; 558 struct proc_dir_entry *tmp;
559 int ret;
559 560
560 i = get_inode_number(); 561 ret = proc_alloc_inum(&dp->low_ino);
561 if (i == 0) 562 if (ret)
562 return -EAGAIN; 563 return ret;
563 dp->low_ino = i;
564 564
565 if (S_ISDIR(dp->mode)) { 565 if (S_ISDIR(dp->mode)) {
566 if (dp->proc_iops == NULL) { 566 if (dp->proc_iops == NULL) {
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data);
764 764
765static void free_proc_entry(struct proc_dir_entry *de) 765static void free_proc_entry(struct proc_dir_entry *de)
766{ 766{
767 release_inode_number(de->low_ino); 767 proc_free_inum(de->low_ino);
768 768
769 if (S_ISLNK(de->mode)) 769 if (S_ISLNK(de->mode))
770 kfree(de->data); 770 kfree(de->data);
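The generic.c change above exports the inode-number allocator and converts it from "0 means failure" to ordinary errno returns (-ENOMEM, -ENOSPC), which lets proc_register() forward the real cause instead of a blanket -EAGAIN. The caller pattern after the conversion, sketched with a hypothetical allocator:

#include <errno.h>
#include <stdio.h>

/* Hypothetical allocator mirroring the new proc_alloc_inum() contract:
 * fill *inum on success and return 0, or return a negative errno. */
static int alloc_inum(unsigned int *inum)
{
	static unsigned int next = 0xf0000000u;	/* cf. PROC_DYNAMIC_FIRST */

	if (next == 0)
		return -ENOSPC;		/* the 32-bit space is exhausted */
	*inum = next++;
	return 0;
}

int main(void)
{
	unsigned int ino;
	int ret = alloc_inum(&ino);

	if (ret) {	/* propagate the specific error, not a sentinel */
		fprintf(stderr, "alloc failed: %d\n", ret);
		return 1;
	}
	printf("inode %#x\n", ino);
	return 0;
}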
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3b22bbdee9ec..439ae6886507 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode)
31 struct proc_dir_entry *de; 31 struct proc_dir_entry *de;
32 struct ctl_table_header *head; 32 struct ctl_table_header *head;
33 const struct proc_ns_operations *ns_ops; 33 const struct proc_ns_operations *ns_ops;
34 void *ns;
34 35
35 truncate_inode_pages(&inode->i_data, 0); 36 truncate_inode_pages(&inode->i_data, 0);
36 clear_inode(inode); 37 clear_inode(inode);
@@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode)
49 } 50 }
50 /* Release any associated namespace */ 51 /* Release any associated namespace */
51 ns_ops = PROC_I(inode)->ns_ops; 52 ns_ops = PROC_I(inode)->ns_ops;
52 if (ns_ops && ns_ops->put) 53 ns = PROC_I(inode)->ns;
53 ns_ops->put(PROC_I(inode)->ns); 54 if (ns_ops && ns)
55 ns_ops->put(ns);
54} 56}
55 57
56static struct kmem_cache * proc_inode_cachep; 58static struct kmem_cache * proc_inode_cachep;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43973b084abf..252544c05207 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -15,6 +15,7 @@ struct ctl_table_header;
15struct mempolicy; 15struct mempolicy;
16 16
17extern struct proc_dir_entry proc_root; 17extern struct proc_dir_entry proc_root;
18extern void proc_self_init(void);
18#ifdef CONFIG_PROC_SYSCTL 19#ifdef CONFIG_PROC_SYSCTL
19extern int proc_sys_init(void); 20extern int proc_sys_init(void);
20extern void sysctl_head_put(struct ctl_table_header *head); 21extern void sysctl_head_put(struct ctl_table_header *head);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439f..e96d4f18ca3a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
249 /* Not inialized....update now */ 249 /* Not inialized....update now */
250 /* find out "max pfn" */ 250 /* find out "max pfn" */
251 end_pfn = 0; 251 end_pfn = 0;
252 for_each_node_state(nid, N_HIGH_MEMORY) { 252 for_each_node_state(nid, N_MEMORY) {
253 unsigned long node_end; 253 unsigned long node_end;
254 node_end = NODE_DATA(nid)->node_start_pfn + 254 node_end = NODE_DATA(nid)->node_start_pfn +
255 NODE_DATA(nid)->node_spanned_pages; 255 NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b178ed733c36..b7a47196c8c3 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -11,6 +11,7 @@
11#include <net/net_namespace.h> 11#include <net/net_namespace.h>
12#include <linux/ipc_namespace.h> 12#include <linux/ipc_namespace.h>
13#include <linux/pid_namespace.h> 13#include <linux/pid_namespace.h>
14#include <linux/user_namespace.h>
14#include "internal.h" 15#include "internal.h"
15 16
16 17
@@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = {
24#ifdef CONFIG_IPC_NS 25#ifdef CONFIG_IPC_NS
25 &ipcns_operations, 26 &ipcns_operations,
26#endif 27#endif
28#ifdef CONFIG_PID_NS
29 &pidns_operations,
30#endif
31#ifdef CONFIG_USER_NS
32 &userns_operations,
33#endif
34 &mntns_operations,
27}; 35};
28 36
29static const struct file_operations ns_file_operations = { 37static const struct file_operations ns_file_operations = {
30 .llseek = no_llseek, 38 .llseek = no_llseek,
31}; 39};
32 40
41static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43};
44
45static int ns_delete_dentry(const struct dentry *dentry)
46{
47 /* Don't cache namespace inodes when not in use */
48 return 1;
49}
50
51static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
52{
53 struct inode *inode = dentry->d_inode;
54 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
55
56 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
57 ns_ops->name, inode->i_ino);
58}
59
60const struct dentry_operations ns_dentry_operations =
61{
62 .d_delete = ns_delete_dentry,
63 .d_dname = ns_dname,
64};
65
66static struct dentry *proc_ns_get_dentry(struct super_block *sb,
+		struct task_struct *task, const struct proc_ns_operations *ns_ops)
+{
+	struct dentry *dentry, *result;
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct qstr qname = { .name = "", };
+	void *ns;
+
+	ns = ns_ops->get(task);
+	if (!ns)
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc_pseudo(sb, &qname);
+	if (!dentry) {
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode = iget_locked(sb, ns_ops->inum(ns));
+	if (!inode) {
+		dput(dentry);
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ei = PROC_I(inode);
+	if (inode->i_state & I_NEW) {
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		inode->i_op = &ns_inode_operations;
+		inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_fop = &ns_file_operations;
+		ei->ns_ops = ns_ops;
+		ei->ns = ns;
+		unlock_new_inode(inode);
+	} else {
+		ns_ops->put(ns);
+	}
+
+	d_set_d_op(dentry, &ns_dentry_operations);
+	result = d_instantiate_unique(dentry, inode);
+	if (result) {
+		dput(dentry);
+		dentry = result;
+	}
+
+	return dentry;
+}
+
+static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct proc_inode *ei = PROC_I(inode);
+	struct task_struct *task;
+	struct dentry *ns_dentry;
+	void *error = ERR_PTR(-EACCES);
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
+	if (IS_ERR(ns_dentry)) {
+		error = ERR_CAST(ns_dentry);
+		goto out_put_task;
+	}
+
+	dput(nd->path.dentry);
+	nd->path.dentry = ns_dentry;
+	error = NULL;
+
+out_put_task:
+	put_task_struct(task);
+out:
+	return error;
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	struct proc_inode *ei = PROC_I(inode);
+	const struct proc_ns_operations *ns_ops = ei->ns_ops;
+	struct task_struct *task;
+	void *ns;
+	char name[50];
+	int len = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	len = -ENOENT;
+	ns = ns_ops->get(task);
+	if (!ns)
+		goto out_put_task;
+
+	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
+	len = strlen(name);
+
+	if (len > buflen)
+		len = buflen;
+	if (copy_to_user(buffer, name, len))
+		len = -EFAULT;
+
+	ns_ops->put(ns);
+out_put_task:
+	put_task_struct(task);
+out:
+	return len;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+	.readlink	= proc_ns_readlink,
+	.follow_link	= proc_ns_follow_link,
+	.setattr	= proc_setattr,
+};
+
 static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct dentry *error = ERR_PTR(-ENOENT);
-	void *ns;
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		goto out;
 
-	ns = ns_ops->get(task);
-	if (!ns)
-		goto out_iput;
-
 	ei = PROC_I(inode);
-	inode->i_mode = S_IFREG|S_IRUSR;
-	inode->i_fop = &ns_file_operations;
+	inode->i_mode = S_IFLNK|S_IRWXUGO;
+	inode->i_op = &proc_ns_link_inode_operations;
 	ei->ns_ops = ns_ops;
-	ei->ns = ns;
 
 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
@@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	error = NULL;
 out:
 	return error;
-out_iput:
-	iput(inode);
-	goto out;
 }
 
 static int proc_ns_fill_cache(struct file *filp, void *dirent,
@@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
 	if (!task)
 		goto out_no_task;
 
-	ret = -EPERM;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	ret = 0;
 	i = filp->f_pos;
 	switch (i) {
@@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 	if (!task)
 		goto out_no_task;
 
-	error = ERR_PTR(-EPERM);
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	last = &ns_entries[ARRAY_SIZE(ns_entries)];
 	for (entry = ns_entries; entry < last; entry++) {
 		if (strlen((*entry)->name) != len)
@@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
 			break;
 	}
-	error = ERR_PTR(-ENOENT);
 	if (entry == last)
 		goto out;
 
@@ -198,3 +337,7 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+bool proc_ns_inode(struct inode *inode)
+{
+	return inode->i_fop == &ns_file_operations;
+}
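
The block above turns each /proc/<pid>/ns/* entry into a symlink-like object backed by one inode per namespace (keyed by ns_ops->inum()), so every task in a given namespace resolves to the same inode and userspace can compare namespaces by inode identity. A hypothetical userspace sketch of that comparison (not part of the patch; same_ns() is an illustrative name):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>

	/* returns 1 if tasks a and b share the namespace, 0 if not, -1 on error */
	int same_ns(pid_t a, pid_t b, const char *ns)	/* e.g. ns = "net" */
	{
		char pa[64], pb[64];
		struct stat sa, sb;

		snprintf(pa, sizeof(pa), "/proc/%d/ns/%s", a, ns);
		snprintf(pb, sizeof(pb), "/proc/%d/ns/%s", b, ns);
		if (stat(pa, &sa) || stat(pb, &sb))
			return -1;
		/* same namespace iff both resolve to the same proc inode */
		return sa.st_dev == sb.st_dev && sa.st_ino == sb.st_ino;
	}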
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index df7dd08d4391..de20ec480fa0 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np,
 	set_node_proc_entry(np, de);
 	for (child = NULL; (child = of_get_next_child(np, child));) {
 		/* Use everything after the last slash, or the full name */
-		p = strrchr(child->full_name, '/');
-		if (!p)
-			p = child->full_name;
-		else
-			++p;
+		p = kbasename(child->full_name);
 
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
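
kbasename() collapses the open-coded strrchr() dance into one call. From memory, the helper in <linux/string.h> is essentially the following; treat this as a sketch rather than a quote:

	static inline const char *kbasename(const char *path)
	{
		const char *tail = strrchr(path, '/');
		/* component after the final '/', or the whole string */
		return tail ? tail + 1 : path;
	}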
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a781bdf06694..701580ddfcc3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }
 
-static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
 {
+	struct ctl_table_root *root = head->root;
 	int mode;
 
 	if (root->permissions)
-		mode = root->permissions(root, current->nsproxy, table);
+		mode = root->permissions(head, table);
 	else
 		mode = table->mode;
 
@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	 * and won't be until we finish.
 	 */
 	error = -EPERM;
-	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
+	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
 		goto out;
 
 	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
-		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
+		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
 		sysctl_head_finish(head);
 	return error;
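
Passing the ctl_table_header instead of the bare root lets a ->permissions() callback recover the owning context from the header itself rather than peeking at current->nsproxy, which is what makes the check meaningful when the caller is in a different namespace. A sketch of what a namespace-aware callback can now look like (illustrative, not part of this hunk; ns_writable() is a hypothetical helper):

	static int example_permissions(struct ctl_table_header *head,
				       struct ctl_table *table)
	{
		/* derive the owning namespace from head->set, not from current */
		if (ns_writable(head))		/* hypothetical check */
			return table->mode | S_IWUSR;
		return table->mode;
	}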
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9889a92d2e01..c6e9fac26bac 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int err;
 	struct super_block *sb;
 	struct pid_namespace *ns;
-	struct proc_inode *ei;
 	char *options;
 
 	if (flags & MS_KERNMOUNT) {
 		ns = (struct pid_namespace *)data;
 		options = NULL;
 	} else {
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 		options = data;
 	}
 
@@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	ei = PROC_I(sb->s_root->d_inode);
-	if (!ei->pid) {
-		rcu_read_lock();
-		ei->pid = get_pid(find_pid_ns(1, ns));
-		rcu_read_unlock();
-	}
-
 	return dget(sb->s_root);
 }
 
@@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
@@ -163,12 +156,8 @@ void __init proc_root_init(void)
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-	err = pid_ns_prepare_proc(&init_pid_ns);
-	if (err) {
-		unregister_filesystem(&proc_fs_type);
-		return;
-	}
 
+	proc_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 000000000000..aa5cc3bff140
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,59 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+
+/*
+ * /proc/self:
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF];
+	if (!tgid)
+		return -ENOENT;
+	sprintf(tmp, "%d", tgid);
+	return vfs_readlink(dentry,buffer,buflen,tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		/* 11 for max length of signed int in decimal + NULL term */
+		name = kmalloc(12, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+	.readlink	= proc_self_readlink,
+	.follow_link	= proc_self_follow_link,
+	.put_link	= proc_self_put_link,
+};
+
+void __init proc_self_init(void)
+{
+	struct proc_dir_entry *proc_self_symlink;
+	mode_t mode;
+
+	mode = S_IFLNK | S_IRWXUGO;
+	proc_self_symlink = proc_create("self", mode, NULL, NULL );
+	proc_self_symlink->proc_iops = &proc_self_inode_operations;
+}
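
The follow_link/put_link pair above is the stock VFS idiom for a symlink whose target is computed on the fly: ->follow_link allocates the target string and parks it in the nameidata with nd_set_link(), and the VFS calls ->put_link after the walk so the allocation can be freed. A minimal sketch of the contract (illustrative names, assuming the same kernel interfaces as the file above):

	static void *example_follow_link(struct dentry *d, struct nameidata *nd)
	{
		char *s = kmalloc(12, GFP_KERNEL);

		if (s)
			sprintf(s, "%d", 42);	/* compute the target here */
		nd_set_link(nd, s ? s : ERR_PTR(-ENOMEM));
		return NULL;			/* cookie handed to ->put_link */
	}

	static void example_put_link(struct dentry *d, struct nameidata *nd,
				     void *cookie)
	{
		char *s = nd_get_link(nd);

		if (!IS_ERR(s))
			kfree(s);		/* free what follow_link allocated */
	}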
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a5..448455b7fd91 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return 0;
 }
 
+static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
+{
+	/*
+	 * Don't forget to update Documentation/ on changes.
+	 */
+	static const char mnemonics[BITS_PER_LONG][2] = {
+		/*
+		 * In case if we meet a flag we don't know about.
+		 */
+		[0 ... (BITS_PER_LONG-1)] = "??",
+
+		[ilog2(VM_READ)]	= "rd",
+		[ilog2(VM_WRITE)]	= "wr",
+		[ilog2(VM_EXEC)]	= "ex",
+		[ilog2(VM_SHARED)]	= "sh",
+		[ilog2(VM_MAYREAD)]	= "mr",
+		[ilog2(VM_MAYWRITE)]	= "mw",
+		[ilog2(VM_MAYEXEC)]	= "me",
+		[ilog2(VM_MAYSHARE)]	= "ms",
+		[ilog2(VM_GROWSDOWN)]	= "gd",
+		[ilog2(VM_PFNMAP)]	= "pf",
+		[ilog2(VM_DENYWRITE)]	= "dw",
+		[ilog2(VM_LOCKED)]	= "lo",
+		[ilog2(VM_IO)]		= "io",
+		[ilog2(VM_SEQ_READ)]	= "sr",
+		[ilog2(VM_RAND_READ)]	= "rr",
+		[ilog2(VM_DONTCOPY)]	= "dc",
+		[ilog2(VM_DONTEXPAND)]	= "de",
+		[ilog2(VM_ACCOUNT)]	= "ac",
+		[ilog2(VM_NORESERVE)]	= "nr",
+		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_NONLINEAR)]	= "nl",
+		[ilog2(VM_ARCH_1)]	= "ar",
+		[ilog2(VM_DONTDUMP)]	= "dd",
+		[ilog2(VM_MIXEDMAP)]	= "mm",
+		[ilog2(VM_HUGEPAGE)]	= "hg",
+		[ilog2(VM_NOHUGEPAGE)]	= "nh",
+		[ilog2(VM_MERGEABLE)]	= "mg",
+	};
+	size_t i;
+
+	seq_puts(m, "VmFlags: ");
+	for (i = 0; i < BITS_PER_LONG; i++) {
+		if (vma->vm_flags & (1UL << i)) {
+			seq_printf(m, "%c%c ",
+				   mnemonics[i][0], mnemonics[i][1]);
+		}
+	}
+	seq_putc(m, '\n');
+}
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
 	struct proc_maps_private *priv = m->private;
@@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		seq_printf(m, "Nonlinear:      %8lu kB\n",
 			   mss.nonlinear >> 10);
 
+	show_smap_vma_flags(m, vma);
+
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task->mm))
 			? vma->vm_start : 0;
@@ -643,7 +696,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;
 
-	split_huge_page_pmd(walk->mm, pmd);
+	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
@@ -1126,7 +1179,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;
 
 	nid = page_to_nid(page);
-	if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+	if (!node_isset(nid, node_states[N_MEMORY]))
 		return NULL;
 
 	return page;
@@ -1279,7 +1332,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	if (md->writeback)
 		seq_printf(m, " writeback=%lu", md->writeback);
 
-	for_each_node_state(n, N_HIGH_MEMORY)
+	for_each_node_state(n, N_MEMORY)
 		if (md->node[n])
 			seq_printf(m, " N%d=%lu", n, md->node[n]);
 out:
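
With this change each /proc/<pid>/smaps entry grows a trailing line of two-letter mnemonics, one per set VM_* bit. Illustrative output (values made up for the example):

	VmFlags: rd ex mr mw me de

which decodes via the table above to VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_DONTEXPAND, so tools no longer have to infer VMA flags from the permission column alone.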
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 2d57e1ac0115..43b12807a51d 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -28,7 +28,9 @@
 #include "internal.h"
 
 static void notrace pstore_ftrace_call(unsigned long ip,
-				       unsigned long parent_ip)
+				       unsigned long parent_ip,
+				       struct ftrace_ops *op,
+				       struct pt_regs *regs)
 {
 	unsigned long flags;
 	struct pstore_ftrace_record rec = {};
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 4ab572e6d277..67de74ca85f4 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -49,6 +49,7 @@ struct pstore_private {
 	struct pstore_info *psi;
 	enum pstore_type_id type;
 	u64	id;
+	int	count;
 	ssize_t	size;
 	char	data[];
 };
@@ -150,13 +151,13 @@ static int pstore_file_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin)
+static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
 {
 	struct seq_file *sf = file->private_data;
 
 	if (sf->op)
-		return seq_lseek(file, off, origin);
-	return default_llseek(file, off, origin);
+		return seq_lseek(file, off, whence);
+	return default_llseek(file, off, whence);
 }
 
 static const struct file_operations pstore_file_operations = {
@@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 	struct pstore_private *p = dentry->d_inode->i_private;
 
 	if (p->psi->erase)
-		p->psi->erase(p->type, p->id, p->psi);
+		p->psi->erase(p->type, p->id, p->count,
+			      dentry->d_inode->i_ctime, p->psi);
 
 	return simple_unlink(dir, dentry);
 }
@@ -270,7 +272,7 @@ int pstore_is_mounted(void)
 * Load it up with "size" bytes of data from "buf".
 * Set the mtime & ctime to the date that this record was originally stored.
 */
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 		  char *data, size_t size, struct timespec time,
 		  struct pstore_info *psi)
 {
@@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 		goto fail_alloc;
 	private->type = type;
 	private->id = id;
+	private->count = count;
 	private->psi = psi;
 
 	switch (type) {
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 4847f588b7d5..937d820f273c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,7 +50,7 @@ extern struct pstore_info *psinfo;
 extern void	pstore_set_kmsg_bytes(int);
 extern void	pstore_get_records(int);
 extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
-			      char *data, size_t size,
+			      int count, char *data, size_t size,
			      struct timespec time, struct pstore_info *psi);
 extern int	pstore_is_mounted(void);
 
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index a40da07e93d6..5ea2e77ff023 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 			break;
 
 		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				    hsize + len, psinfo);
+				    oopscount, hsize + len, psinfo);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;
 
@@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 
 	while (s < e) {
 		unsigned long flags;
+		u64 id;
 
 		if (c > psinfo->bufsize)
 			c = psinfo->bufsize;
@@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
 		memcpy(psinfo->buf, s, c);
-		psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
@@ -196,7 +197,7 @@ static void pstore_register_console(void) {}
 
 static int pstore_write_compat(enum pstore_type_id type,
 			       enum kmsg_dump_reason reason,
-			       u64 *id, unsigned int part,
+			       u64 *id, unsigned int part, int count,
 			       size_t size, struct pstore_info *psi)
 {
 	return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
@@ -266,6 +267,7 @@ void pstore_get_records(int quiet)
 	char			*buf = NULL;
 	ssize_t			size;
 	u64			id;
+	int			count;
 	enum pstore_type_id	type;
 	struct timespec		time;
 	int			failed = 0, rc;
@@ -277,9 +279,9 @@ void pstore_get_records(int quiet)
 	if (psi->open && psi->open(psi))
 		goto out;
 
-	while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
-		rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
-				   time, psi);
+	while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) {
+		rc = pstore_mkfile(type, psi->name, id, count, buf,
+				   (size_t)size, time, psi);
 		kfree(buf);
 		buf = NULL;
 		if (rc && (rc != -EEXIST || !quiet))
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1a4f6da58eab..f883e7e74305 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
 }
 
 static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
-				   struct timespec *time,
-				   char **buf,
-				   struct pstore_info *psi)
+				   int *count, struct timespec *time,
+				   char **buf, struct pstore_info *psi)
 {
 	ssize_t size;
 	struct ramoops_context *cxt = psi->data;
@@ -189,7 +188,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 				  struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
-	struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt];
+	struct persistent_ram_zone *prz;
 	size_t hlen;
 
 	if (type == PSTORE_TYPE_CONSOLE) {
@@ -226,6 +225,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 	if (part != 1)
 		return -ENOSPC;
 
+	if (!cxt->przs)
+		return -ENOSPC;
+
+	prz = cxt->przs[cxt->dump_write_cnt];
+
 	hlen = ramoops_write_kmsg_hdr(prz);
 	if (size + hlen > prz->buffer_size)
 		size = prz->buffer_size - hlen;
@@ -236,8 +240,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 	return 0;
 }
 
-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id,
-				struct pstore_info *psi)
+static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
+				struct timespec time, struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
 	struct persistent_ram_zone *prz;
@@ -287,8 +291,9 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
 	kfree(cxt->przs);
 }
 
-static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
-			     phys_addr_t *paddr, size_t dump_mem_sz)
+static int __devinit ramoops_init_przs(struct device *dev,
+				       struct ramoops_context *cxt,
+				       phys_addr_t *paddr, size_t dump_mem_sz)
 {
 	int err = -ENOMEM;
 	int i;
@@ -296,6 +301,11 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
 	if (!cxt->record_size)
 		return 0;
 
+	if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
+		dev_err(dev, "no room for dumps\n");
+		return -ENOMEM;
+	}
+
 	cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
 	if (!cxt->max_dump_cnt)
 		return -ENOMEM;
@@ -326,15 +336,20 @@ fail_prz:
 	return err;
 }
 
-static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
-			    struct persistent_ram_zone **prz,
-			    phys_addr_t *paddr, size_t sz, u32 sig)
+static int __devinit ramoops_init_prz(struct device *dev,
+				      struct ramoops_context *cxt,
+				      struct persistent_ram_zone **prz,
+				      phys_addr_t *paddr, size_t sz, u32 sig)
 {
 	if (!sz)
 		return 0;
 
-	if (*paddr + sz > *paddr + cxt->size)
+	if (*paddr + sz - cxt->phys_addr > cxt->size) {
+		dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+			sz, (unsigned long long)*paddr,
+			cxt->size, (unsigned long long)cxt->phys_addr);
 		return -ENOMEM;
+	}
 
 	*prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size);
 	if (IS_ERR(*prz)) {
@@ -374,10 +389,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
 		goto fail_out;
 	}
 
-	pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
-	pdata->record_size = rounddown_pow_of_two(pdata->record_size);
-	pdata->console_size = rounddown_pow_of_two(pdata->console_size);
-	pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
+	if (!is_power_of_2(pdata->mem_size))
+		pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
+	if (!is_power_of_2(pdata->record_size))
+		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
+	if (!is_power_of_2(pdata->console_size))
+		pdata->console_size = rounddown_pow_of_two(pdata->console_size);
+	if (!is_power_of_2(pdata->ftrace_size))
+		pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
 
 	cxt->dump_read_cnt = 0;
 	cxt->size = pdata->mem_size;
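
The bounds-check rewrite in ramoops_init_prz() matters: the old guard compared against a bound that moved with *paddr, so it could only trip on integer overflow. Rewriting it relative to the fixed region start makes the test meaningful:

	/* region is [phys_addr, phys_addr + size); candidate is [*paddr, *paddr + sz) */
	if (*paddr + sz - cxt->phys_addr > cxt->size)
		return -ENOMEM;	/* candidate runs past the end of the region */

Worked example (made-up numbers): with phys_addr = 0x1000, size = 0x1000 and *paddr = 0x1800, a request of sz = 0x1000 gives 0x1800 + 0x1000 - 0x1000 = 0x1800 > 0x1000, so it is correctly rejected, whereas the old test (0x2800 > 0x2800) let it through.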
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index af1661f7a54f..c7314f1771f5 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -307,6 +307,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	}
 }
 
+#ifdef CONFIG_BLOCK
+
 /* Return 1 if 'cmd' will block on frozen filesystem */
 static int quotactl_cmd_write(int cmd)
 {
@@ -322,6 +324,8 @@ static int quotactl_cmd_write(int cmd)
 	return 1;
 }
 
+#endif /* CONFIG_BLOCK */
+
 /*
 * look up a superblock on which quota ops will be performed
 * - use the name of a block device to find the superblock thereon
diff --git a/fs/read_write.c b/fs/read_write.c
index d06534857e9e..1edaf099ddd7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -54,7 +54,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
- * @origin:	type of seek
+ * @whence:	type of seek
 * @size:	max size of this file in file system
 * @eof:	offset used for SEEK_END position
 *
@@ -67,12 +67,12 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
 * read/writes behave like SEEK_SET against seeks.
 */
 loff_t
-generic_file_llseek_size(struct file *file, loff_t offset, int origin,
+generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
 {
 	struct inode *inode = file->f_mapping->host;
 
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 		offset += eof;
 		break;
@@ -122,17 +122,17 @@ EXPORT_SYMBOL(generic_file_llseek_size);
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
- * @origin:	type of seek
+ * @whence:	type of seek
 *
 * This is a generic implemenation of ->llseek useable for all normal local
 * filesystems.  It just updates the file offset to the value specified by
- * @offset and @origin under i_mutex.
+ * @offset and @whence under i_mutex.
 */
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 
-	return generic_file_llseek_size(file, offset, origin,
+	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
 }
@@ -142,32 +142,32 @@ EXPORT_SYMBOL(generic_file_llseek);
 * noop_llseek - No Operation Performed llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
- * @origin:	type of seek
+ * @whence:	type of seek
 *
 * This is an implementation of ->llseek useable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
-loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 {
 	return file->f_pos;
 }
 EXPORT_SYMBOL(noop_llseek);
 
-loff_t no_llseek(struct file *file, loff_t offset, int origin)
+loff_t no_llseek(struct file *file, loff_t offset, int whence)
 {
 	return -ESPIPE;
 }
 EXPORT_SYMBOL(no_llseek);
 
-loff_t default_llseek(struct file *file, loff_t offset, int origin)
+loff_t default_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	loff_t retval;
 
 	mutex_lock(&inode->i_mutex);
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 		offset += i_size_read(inode);
 		break;
@@ -216,7 +216,7 @@ out:
 }
 EXPORT_SYMBOL(default_llseek);
 
-loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
+loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t (*fn)(struct file *, loff_t, int);
 
@@ -225,11 +225,11 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 		if (file->f_op && file->f_op->llseek)
			fn = file->f_op->llseek;
 	}
-	return fn(file, offset, origin);
+	return fn(file, offset, whence);
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
 	off_t retval;
 	struct fd f = fdget(fd);
@@ -237,8 +237,8 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
		return -EBADF;
 
 	retval = -EINVAL;
-	if (origin <= SEEK_MAX) {
-		loff_t res = vfs_llseek(f.file, offset, origin);
+	if (whence <= SEEK_MAX) {
+		loff_t res = vfs_llseek(f.file, offset, whence);
 		retval = res;
 		if (res != (loff_t)retval)
			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
@@ -250,7 +250,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 #ifdef __ARCH_WANT_SYS_LLSEEK
 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
-		unsigned int, origin)
+		unsigned int, whence)
 {
 	int retval;
 	struct fd f = fdget(fd);
@@ -260,11 +260,11 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		return -EBADF;
 
 	retval = -EINVAL;
-	if (origin > SEEK_MAX)
+	if (whence > SEEK_MAX)
		goto out_putf;
 
 	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
-			origin);
+			whence);
 
 	retval = (int)offset;
 	if (offset >= 0) {
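
The rename is purely cosmetic: "whence" is the name POSIX and the lseek(2) man page use for the third argument, so the kernel-internal naming now matches what callers see. Call sites are unchanged, e.g. from userspace:

	off_t end = lseek(fd, 0, SEEK_END);	/* whence = SEEK_END: file size */
	lseek(fd, 0, SEEK_SET);			/* rewind to the start */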
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f27f01a98aa2..d83736fbc26c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	dquot_initialize(inode);
+	reiserfs_write_unlock(inode->i_sb);
 	err = dquot_alloc_inode(inode);
+	reiserfs_write_lock(inode->i_sb);
 	if (err)
 		goto out_end_trans;
 	if (!dir->i_nlink) {
@@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
+	reiserfs_write_unlock(inode->i_sb);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
 	dquot_drop(inode);
+	reiserfs_write_lock(inode->i_sb);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
-	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
-
+	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		** done for us in vmtruncate
@@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	error = journal_begin(&th, inode->i_sb, jbegin_count);
 	if (error)
 		goto out;
+	reiserfs_write_unlock_once(inode->i_sb, depth);
 	error = dquot_transfer(inode, attr);
+	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (error) {
 		journal_end(&th, inode->i_sb, jbegin_count);
 		goto out;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index f8afa4b162b8..2f40a4c70a4d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       key2type(&(key->on_disk_key)));
 #endif
 
+	reiserfs_write_unlock(inode->i_sb);
 	retval = dquot_alloc_space_nodirty(inode, pasted_size);
+	reiserfs_write_lock(inode->i_sb);
 	if (retval) {
 		pathrelse(search_path);
 		return retval;
@@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
 		       "reiserquota insert_item(): allocating %u id=%u type=%c",
 		       quota_bytes, inode->i_uid, head2type(ih));
 #endif
+	reiserfs_write_unlock(inode->i_sb);
 	/* We can't dirty inode here. It would be immediately written but
 	 * appropriate stat item isn't inserted yet... */
 	retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+	reiserfs_write_lock(inode->i_sb);
 	if (retval) {
 		pathrelse(path);
 		return retval;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1078ae179993..418bdc3a57da 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
+		reiserfs_write_unlock(s);
 		dquot_initialize(inode);
+		reiserfs_write_lock(s);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -1335,7 +1337,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 				kfree(qf_names[i]);
 #endif
 		err = -EINVAL;
-		goto out_err;
+		goto out_unlock;
 	}
 #ifdef CONFIG_QUOTA
 	handle_quota_files(s, qf_names, &qfmt);
@@ -1379,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	if (blocks) {
 		err = reiserfs_resize(s, blocks);
 		if (err != 0)
-			goto out_err;
+			goto out_unlock;
 	}
 
 	if (*mount_flags & MS_RDONLY) {
@@ -1389,9 +1391,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 			/* it is read-only already */
 			goto out_ok;
 
+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_suspend() without it.
+		 */
+		reiserfs_write_unlock(s);
 		err = dquot_suspend(s, -1);
 		if (err < 0)
 			goto out_err;
+		reiserfs_write_lock(s);
 
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
@@ -1401,7 +1409,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;
 
 		/* Mounting a rw partition read-only. */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1416,7 +1424,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 		if (reiserfs_is_journal_aborted(journal)) {
 			err = journal->j_errno;
-			goto out_err;
+			goto out_unlock;
 		}
 
 		handle_data_mode(s, mount_options);
@@ -1425,7 +1433,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		s->s_flags &= ~MS_RDONLY;	/* now it is safe to call journal_begin */
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;
 
 		/* Mount a partition which is read-only, read-write */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1442,10 +1450,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	SB_JOURNAL(s)->j_must_wait = 1;
 	err = journal_end(&th, s, 10);
 	if (err)
-		goto out_err;
+		goto out_unlock;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_resume() without it.
+		 */
+		reiserfs_write_unlock(s);
 		dquot_resume(s, -1);
+		reiserfs_write_lock(s);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -1455,9 +1469,10 @@ out_ok:
 	reiserfs_write_unlock(s);
 	return 0;
 
+out_unlock:
+	reiserfs_write_unlock(s);
 out_err:
 	kfree(new_opts);
-	reiserfs_write_unlock(s);
 	return err;
 }
 
@@ -2095,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot)
 				  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(dquot->dq_sb);
 	ret = dquot_commit(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(dquot->dq_sb);
 	return ret;
 }
@@ -2117,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
 				  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(dquot->dq_sb);
 	ret = dquot_acquire(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(dquot->dq_sb);
 	return ret;
 }
@@ -2137,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot)
 	ret =
 	    journal_begin(&th, dquot->dq_sb,
 			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	reiserfs_write_unlock(dquot->dq_sb);
 	if (ret) {
 		/* Release dquot anyway to avoid endless cycle in dqput() */
 		dquot_release(dquot);
 		goto out;
 	}
 	ret = dquot_release(dquot);
+	reiserfs_write_lock(dquot->dq_sb);
 	err =
 	    journal_end(&th, dquot->dq_sb,
 			REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (!ret && err)
 		ret = err;
-      out:
 	reiserfs_write_unlock(dquot->dq_sb);
+out:
 	return ret;
 }
 
@@ -2174,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type)
 	ret = journal_begin(&th, sb, 2);
 	if (ret)
 		goto out;
+	reiserfs_write_unlock(sb);
 	ret = dquot_commit_info(sb, type);
+	reiserfs_write_lock(sb);
 	err = journal_end(&th, sb, 2);
 	if (!ret && err)
 		ret = err;
-      out:
+out:
 	reiserfs_write_unlock(sb);
 	return ret;
 }
@@ -2203,8 +2226,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	struct reiserfs_transaction_handle th;
 	int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
 
-	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt)))
-		return -EINVAL;
+	reiserfs_write_lock(sb);
+	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	/* Quotafile not on the same filesystem? */
 	if (path->dentry->d_sb != sb) {
@@ -2246,8 +2272,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on(sb, type, format_id, path);
+	reiserfs_write_unlock(sb);
+	return dquot_quota_on(sb, type, format_id, path);
 out:
+	reiserfs_write_unlock(sb);
 	return err;
 }
 
@@ -2320,7 +2348,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		tocopy = sb->s_blocksize - offset < towrite ?
 		    sb->s_blocksize - offset : towrite;
 		tmp_bh.b_state = 0;
+		reiserfs_write_lock(sb);
 		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
+		reiserfs_write_unlock(sb);
 		if (err)
 			goto out;
 		if (offset || tocopy != sb->s_blocksize)
@@ -2336,10 +2366,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 		flush_dcache_page(bh->b_page);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
+		reiserfs_write_lock(sb);
 		reiserfs_prepare_for_journal(sb, bh, 1);
 		journal_mark_dirty(current->journal_info, sb, bh);
 		if (!journal_quota)
 			reiserfs_add_ordered_list(inode, bh);
+		reiserfs_write_unlock(sb);
 		brelse(bh);
 		offset = 0;
 		towrite -= tocopy;
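
All of the reiserfs unlock/relock pairs in this patch follow one rule: the per-superblock reiserfs write lock ranks below the quota locks, so any dquot_*() call that may take quota locks has to run with the write lock dropped, and the lock is retaken before touching the journal again. Schematically (a sketch of the pattern, not a new API):

	reiserfs_write_unlock(sb);	/* drop FS lock: quota locks rank above it */
	ret = dquot_commit(dquot);	/* may take dq_lock, dqio_mutex, ... */
	reiserfs_write_lock(sb);	/* retake before journal_end() */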
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 99dffab4c4e4..9d863fb501f9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -300,14 +300,14 @@ EXPORT_SYMBOL(seq_read);
 *
 *	Ready-made ->f_op->llseek()
 */
-loff_t seq_lseek(struct file *file, loff_t offset, int origin)
+loff_t seq_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct seq_file *m = file->private_data;
 	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
 	m->version = file->f_version;
-	switch (origin) {
+	switch (whence) {
 	case 1:
 		offset += file->f_pos;
 	case 0:
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 8bee4e570911..b53486961735 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -29,6 +29,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/signalfd.h>
 #include <linux/syscalls.h>
+#include <linux/proc_fs.h>
 
 void signalfd_cleanup(struct sighand_struct *sighand)
 {
@@ -227,7 +228,24 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 	return total ? total: ret;
 }
 
+#ifdef CONFIG_PROC_FS
+static int signalfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct signalfd_ctx *ctx = f->private_data;
+	sigset_t sigmask;
+
+	sigmask = ctx->sigmask;
+	signotset(&sigmask);
+	render_sigset_t(m, "sigmask:\t", &sigmask);
+
+	return 0;
+}
+#endif
+
 static const struct file_operations signalfd_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= signalfd_show_fdinfo,
+#endif
 	.release	= signalfd_release,
 	.poll		= signalfd_poll,
 	.read		= signalfd_read,
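
With ->show_fdinfo wired up, the signal mask of a signalfd becomes visible through procfs, which is the kind of per-fd state checkpoint/restore tooling needs. Illustrative read (pos/flags come from the generic fdinfo code; the mask value is made up for the example):

	$ cat /proc/$PID/fdinfo/$FD
	pos:	0
	flags:	02
	sigmask:	0000000000000200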
diff --git a/fs/splice.c b/fs/splice.c
index 13e5b4776e7a..8890604e3fcd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	ret = sd.num_spliced;
 
 	if (ret > 0) {
-		unsigned long nr_pages;
 		int err;
 
-		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
 		err = generic_write_sync(out, *ppos, ret);
 		if (err)
 			ret = err;
 		else
 			*ppos += ret;
-		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+		balance_dirty_pages_ratelimited(mapping);
 	}
 	sb_end_write(inode->i_sb);
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 00012e31829d..602f56db0442 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = {
 	.poll		= sysfs_poll,
 };
 
-int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
-		  const void **pns)
+static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
+			 const void **pns)
 {
 	struct sysfs_dirent *dir_sd = kobj->sd;
 	const struct sysfs_ops *ops;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 71eb7e253927..db940a9be045 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 62911637e12f..12817ffc7345 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2560,7 +2560,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
 static int corrupt_data(const struct ubifs_info *c, const void *buf,
			unsigned int len)
 {
-	unsigned int from, to, i, ffs = chance(1, 2);
+	unsigned int from, to, ffs = chance(1, 2);
 	unsigned char *p = (void *)buf;
 
 	from = random32() % (len + 1);
@@ -2571,11 +2571,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
		 ffs ? "0xFFs" : "random data");
 
 	if (ffs)
-		for (i = from; i < to; i++)
-			p[i] = 0xFF;
+		memset(p + from, 0xFF, to - from);
 	else
-		for (i = from; i < to; i++)
-			p[i] = random32() % 0x100;
+		prandom_bytes(p + from, to - from);
 
 	return to;
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e271fba1651b..8a574776a493 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -453,11 +453,11 @@ out:
 }
 
 /* If a directory is seeked, we have to free saved readdir() state */
-static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
+static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	kfree(file->private_data);
 	file->private_data = NULL;
-	return generic_file_llseek(file, offset, origin);
+	return generic_file_llseek(file, offset, whence);
 }
 
 /* Free saved readdir() state when the directory is closed */
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 28ec13af28d9..2dcf3d473fec 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
 	if (!lprops) {
 		lprops = ubifs_fast_find_freeable(c);
 		if (!lprops) {
-			ubifs_assert(c->freeable_cnt == 0);
-			if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+			/*
+			 * The first condition means the following: go scan the
+			 * LPT if there are uncategorized lprops, which means
+			 * there may be freeable LEBs there (UBIFS does not
+			 * store the information about freeable LEBs in the
+			 * master node).
+			 */
+			if (c->in_a_category_cnt != c->main_lebs ||
+			    c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+				ubifs_assert(c->freeable_cnt == 0);
				lprops = scan_for_leb_for_idx(c);
				if (IS_ERR(lprops)) {
					err = PTR_ERR(lprops);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index e5a2a35a46dc..46190a7c42a6 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 	default:
 		ubifs_assert(0);
 	}
+
 	lprops->flags &= ~LPROPS_CAT_MASK;
 	lprops->flags |= cat;
+	c->in_a_category_cnt += 1;
+	ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
 }
 
 /**
@@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c,
 	default:
 		ubifs_assert(0);
 	}
+
+	c->in_a_category_cnt -= 1;
+	ubifs_assert(c->in_a_category_cnt >= 0);
 }
 
 /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5486346d0a3f..d133c276fe05 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1183,6 +1183,8 @@ struct ubifs_debug_info;
1183 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) 1183 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1184 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) 1184 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1185 * @freeable_cnt: number of freeable LEBs in @freeable_list 1185 * @freeable_cnt: number of freeable LEBs in @freeable_list
1186 * @in_a_category_cnt: count of lprops which are in a certain category, which
1187 * basically means that they were loaded from the flash
1186 * 1188 *
1187 * @ltab_lnum: LEB number of LPT's own lprops table 1189 * @ltab_lnum: LEB number of LPT's own lprops table
1188 * @ltab_offs: offset of LPT's own lprops table 1190 * @ltab_offs: offset of LPT's own lprops table
@@ -1412,6 +1414,7 @@ struct ubifs_info {
1412 struct list_head freeable_list; 1414 struct list_head freeable_list;
1413 struct list_head frdi_idx_list; 1415 struct list_head frdi_idx_list;
1414 int freeable_cnt; 1416 int freeable_cnt;
1417 int in_a_category_cnt;
1415 1418
1416 int ltab_lnum; 1419 int ltab_lnum;
1417 int ltab_offs; 1420 int ltab_offs;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index df88b957ccf0..cbae1ed0b7c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -587,7 +587,6 @@ out:
587static sector_t inode_getblk(struct inode *inode, sector_t block, 587static sector_t inode_getblk(struct inode *inode, sector_t block,
588 int *err, int *new) 588 int *err, int *new)
589{ 589{
590 static sector_t last_block;
591 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 590 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
592 struct extent_position prev_epos, cur_epos, next_epos; 591 struct extent_position prev_epos, cur_epos, next_epos;
593 int count = 0, startnum = 0, endnum = 0; 592 int count = 0, startnum = 0, endnum = 0;
@@ -601,6 +600,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
601 struct udf_inode_info *iinfo = UDF_I(inode); 600 struct udf_inode_info *iinfo = UDF_I(inode);
602 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; 601 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
603 int lastblock = 0; 602 int lastblock = 0;
603 bool isBeyondEOF;
604 604
605 *err = 0; 605 *err = 0;
606 *new = 0; 606 *new = 0;
@@ -676,11 +676,10 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
676 return newblock; 676 return newblock;
677 } 677 }
678 678
679 last_block = block;
680 /* Are we beyond EOF? */ 679 /* Are we beyond EOF? */
681 if (etype == -1) { 680 if (etype == -1) {
682 int ret; 681 int ret;
683 682 isBeyondEOF = 1;
684 if (count) { 683 if (count) {
685 if (c) 684 if (c)
686 laarr[0] = laarr[1]; 685 laarr[0] = laarr[1];
@@ -718,11 +717,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
718 memset(&laarr[c].extLocation, 0x00, 717 memset(&laarr[c].extLocation, 0x00,
719 sizeof(struct kernel_lb_addr)); 718 sizeof(struct kernel_lb_addr));
720 count++; 719 count++;
721 endnum++;
722 } 720 }
723 endnum = c + 1; 721 endnum = c + 1;
724 lastblock = 1; 722 lastblock = 1;
725 } else { 723 } else {
724 isBeyondEOF = 0;
726 endnum = startnum = ((count > 2) ? 2 : count); 725 endnum = startnum = ((count > 2) ? 2 : count);
727 726
728 /* if the current extent is in position 0, 727 /* if the current extent is in position 0,
@@ -765,10 +764,13 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
765 goal, err); 764 goal, err);
766 if (!newblocknum) { 765 if (!newblocknum) {
767 brelse(prev_epos.bh); 766 brelse(prev_epos.bh);
767 brelse(cur_epos.bh);
768 brelse(next_epos.bh);
768 *err = -ENOSPC; 769 *err = -ENOSPC;
769 return 0; 770 return 0;
770 } 771 }
771 iinfo->i_lenExtents += inode->i_sb->s_blocksize; 772 if (isBeyondEOF)
773 iinfo->i_lenExtents += inode->i_sb->s_blocksize;
772 } 774 }
773 775
 774 /* if the extent the requested block is located in contains multiple 776 /* if the extent the requested block is located in contains multiple
@@ -795,6 +797,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
795 udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); 797 udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
796 798
797 brelse(prev_epos.bh); 799 brelse(prev_epos.bh);
800 brelse(cur_epos.bh);
801 brelse(next_epos.bh);
798 802
799 newblock = udf_get_pblock(inode->i_sb, newblocknum, 803 newblock = udf_get_pblock(inode->i_sb, newblocknum,
800 iinfo->i_location.partitionReferenceNum, 0); 804 iinfo->i_location.partitionReferenceNum, 0);
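
Two separate bugs are fixed in this file: cur_epos.bh and next_epos.bh leaked on both the -ENOSPC path and the normal return, and i_lenExtents grew even when the mapped block fell inside the existing extents (the new isBeyondEOF flag gates that). The release discipline, reduced to a standalone sketch where release() models the kernel's NULL-tolerant brelse():

struct bh { int refcount; };

static void release(struct bh *b)       /* brelse(): no-op on NULL */
{
        if (b)
                b->refcount--;
}

/* Every exit path must drop all three pinned extent positions. */
static int getblk_sketch(struct bh *prev, struct bh *cur, struct bh *next,
                         int alloc_failed)
{
        if (alloc_failed) {
                release(prev);
                release(cur);
                release(next);
                return -1;              /* -ENOSPC in the real function */
        }
        /* ... map or allocate the block ... */
        release(prev);
        release(cur);
        release(next);
        return 0;
}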
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select EXPORTFS 4 select EXPORTFS
5 select LIBCRC32C
5 help 6 help
6 XFS is a high performance journaling filesystem which originated 7 XFS is a high performance journaling filesystem which originated
7 on the SGI IRIX platform. It is completely multi-threaded, can 8 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
37 xfs_file.o \ 37 xfs_file.o \
38 xfs_filestream.o \ 38 xfs_filestream.o \
39 xfs_fsops.o \ 39 xfs_fsops.o \
40 xfs_fs_subr.o \
41 xfs_globals.o \ 40 xfs_globals.o \
42 xfs_iget.o \ 41 xfs_icache.o \
43 xfs_ioctl.o \ 42 xfs_ioctl.o \
44 xfs_iomap.o \ 43 xfs_iomap.o \
45 xfs_iops.o \ 44 xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
47 xfs_message.o \ 46 xfs_message.o \
48 xfs_mru_cache.o \ 47 xfs_mru_cache.o \
49 xfs_super.o \ 48 xfs_super.o \
50 xfs_sync.o \
51 xfs_xattr.o \ 49 xfs_xattr.o \
52 xfs_rename.o \ 50 xfs_rename.o \
53 xfs_utils.o \ 51 xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
28 28
29static inline void
30uuid_copy(uuid_t *dst, uuid_t *src)
31{
32 memcpy(dst, src, sizeof(uuid_t));
33}
34
29#endif /* __XFS_SUPPORT_UUID_H__ */ 35#endif /* __XFS_SUPPORT_UUID_H__ */
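
A hypothetical caller, only to show the intended use of the new inline (variable names invented):

        uuid_t fs_uuid, snap_uuid;

        uuid_copy(&snap_uuid, &fs_uuid);        /* byte-wise uuid_t copy */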
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
110 110
111extern const struct xfs_buf_ops xfs_agf_buf_ops;
112
111/* 113/*
112 * Size of the unlinked inode hash table in the agi. 114 * Size of the unlinked inode hash table in the agi.
113 */ 115 */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 163extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 164 xfs_agnumber_t agno, struct xfs_buf **bpp);
163 165
166extern const struct xfs_buf_ops xfs_agi_buf_ops;
167
164/* 168/*
165 * The third a.g. block contains the a.g. freelist, an array 169 * The third a.g. block contains the a.g. freelist, an array
166 * of block pointers to blocks owned by the allocation btree code. 170 * of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
233#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup 237#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
234 in xfs_inode_ag_iterator */ 238 in xfs_inode_ag_iterator */
235#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ 239#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
240#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
236 241
237#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 242#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
238#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 243#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe3..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
430 return 0; 430 return 0;
431} 431}
432 432
433static void
434xfs_agfl_verify(
435 struct xfs_buf *bp)
436{
437#ifdef WHEN_CRCS_COME_ALONG
438 /*
439 * we cannot actually do any verification of the AGFL because mkfs does
440 * not initialise the AGFL to zero or NULL. Hence the only valid part of
441 * the AGFL is what the AGF says is active. We can't get to the AGF, so
442 * we can't verify just those entries are valid.
443 *
444 * This problem goes away when the CRC format change comes along as that
445 * requires the AGFL to be initialised by mkfs. At that point, we can
446 * verify the blocks in the agfl -active or not- lie within the bounds
447 * of the AG. Until then, just leave this check ifdef'd out.
448 */
449 struct xfs_mount *mp = bp->b_target->bt_mount;
450 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
451 int agfl_ok = 1;
452
453 int i;
454
455 for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
456 if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
457 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
458 agfl_ok = 0;
459 }
460
461 if (!agfl_ok) {
462 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
463 xfs_buf_ioerror(bp, EFSCORRUPTED);
464 }
465#endif
466}
467
468static void
469xfs_agfl_write_verify(
470 struct xfs_buf *bp)
471{
472 xfs_agfl_verify(bp);
473}
474
475static void
476xfs_agfl_read_verify(
477 struct xfs_buf *bp)
478{
479 xfs_agfl_verify(bp);
480}
481
482const struct xfs_buf_ops xfs_agfl_buf_ops = {
483 .verify_read = xfs_agfl_read_verify,
484 .verify_write = xfs_agfl_write_verify,
485};
486
433/* 487/*
434 * Read in the allocation group free block array. 488 * Read in the allocation group free block array.
435 */ 489 */
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
447 error = xfs_trans_read_buf( 501 error = xfs_trans_read_buf(
448 mp, tp, mp->m_ddev_targp, 502 mp, tp, mp->m_ddev_targp,
449 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), 503 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
450 XFS_FSS_TO_BB(mp, 1), 0, &bp); 504 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
451 if (error) 505 if (error)
452 return error; 506 return error;
453 ASSERT(!xfs_buf_geterror(bp)); 507 ASSERT(!xfs_buf_geterror(bp));
@@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist(
1866 /* 1920 /*
1867 * Initialize the args structure. 1921 * Initialize the args structure.
1868 */ 1922 */
1923 memset(&targs, 0, sizeof(targs));
1869 targs.tp = tp; 1924 targs.tp = tp;
1870 targs.mp = mp; 1925 targs.mp = mp;
1871 targs.agbp = agbp; 1926 targs.agbp = agbp;
@@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist(
2090 return 0; 2145 return 0;
2091} 2146}
2092 2147
2148static void
2149xfs_agf_verify(
2150 struct xfs_buf *bp)
2151 {
2152 struct xfs_mount *mp = bp->b_target->bt_mount;
2153 struct xfs_agf *agf;
2154 int agf_ok;
2155
2156 agf = XFS_BUF_TO_AGF(bp);
2157
2158 agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2159 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2160 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2161 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2162 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2163 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
2164
2165 /*
2166 * during growfs operations, the perag is not fully initialised,
2167 * so we can't use it for any useful checking. growfs ensures we can't
2168 * use it by using uncached buffers that don't have the perag attached
2169 * so we can detect and avoid this problem.
2170 */
2171 if (bp->b_pag)
2172 agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
2173 bp->b_pag->pag_agno;
2174
2175 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2176 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2177 be32_to_cpu(agf->agf_length);
2178
2179 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2180 XFS_RANDOM_ALLOC_READ_AGF))) {
2181 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
2182 xfs_buf_ioerror(bp, EFSCORRUPTED);
2183 }
2184}
2185
2186static void
2187xfs_agf_read_verify(
2188 struct xfs_buf *bp)
2189{
2190 xfs_agf_verify(bp);
2191}
2192
2193static void
2194xfs_agf_write_verify(
2195 struct xfs_buf *bp)
2196{
2197 xfs_agf_verify(bp);
2198}
2199
2200const struct xfs_buf_ops xfs_agf_buf_ops = {
2201 .verify_read = xfs_agf_read_verify,
2202 .verify_write = xfs_agf_write_verify,
2203};
2204
2093/* 2205/*
2094 * Read in the allocation group header (free/alloc section). 2206 * Read in the allocation group header (free/alloc section).
2095 */ 2207 */
@@ -2101,44 +2213,19 @@ xfs_read_agf(
2101 int flags, /* XFS_BUF_ */ 2213 int flags, /* XFS_BUF_ */
2102 struct xfs_buf **bpp) /* buffer for the ag freelist header */ 2214 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2103{ 2215{
2104 struct xfs_agf *agf; /* ag freelist header */
2105 int agf_ok; /* set if agf is consistent */
2106 int error; 2216 int error;
2107 2217
2108 ASSERT(agno != NULLAGNUMBER); 2218 ASSERT(agno != NULLAGNUMBER);
2109 error = xfs_trans_read_buf( 2219 error = xfs_trans_read_buf(
2110 mp, tp, mp->m_ddev_targp, 2220 mp, tp, mp->m_ddev_targp,
2111 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2221 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2112 XFS_FSS_TO_BB(mp, 1), flags, bpp); 2222 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2113 if (error) 2223 if (error)
2114 return error; 2224 return error;
2115 if (!*bpp) 2225 if (!*bpp)
2116 return 0; 2226 return 0;
2117 2227
2118 ASSERT(!(*bpp)->b_error); 2228 ASSERT(!(*bpp)->b_error);
2119 agf = XFS_BUF_TO_AGF(*bpp);
2120
2121 /*
2122 * Validate the magic number of the agf block.
2123 */
2124 agf_ok =
2125 agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2126 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2127 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2128 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2129 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2130 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2131 be32_to_cpu(agf->agf_seqno) == agno;
2132 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2133 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2134 be32_to_cpu(agf->agf_length);
2135 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2136 XFS_RANDOM_ALLOC_READ_AGF))) {
2137 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2138 XFS_ERRLEVEL_LOW, mp, agf);
2139 xfs_trans_brelse(tp, *bpp);
2140 return XFS_ERROR(EFSCORRUPTED);
2141 }
2142 xfs_buf_set_ref(*bpp, XFS_AGF_REF); 2229 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2143 return 0; 2230 return 0;
2144} 2231}
@@ -2207,7 +2294,7 @@ xfs_alloc_read_agf(
2207 * group or loop over the allocation groups to find the result. 2294 * group or loop over the allocation groups to find the result.
2208 */ 2295 */
2209int /* error */ 2296int /* error */
2210__xfs_alloc_vextent( 2297xfs_alloc_vextent(
2211 xfs_alloc_arg_t *args) /* allocation argument structure */ 2298 xfs_alloc_arg_t *args) /* allocation argument structure */
2212{ 2299{
2213 xfs_agblock_t agsize; /* allocation group size */ 2300 xfs_agblock_t agsize; /* allocation group size */
@@ -2417,46 +2504,6 @@ error0:
2417 return error; 2504 return error;
2418} 2505}
2419 2506
2420static void
2421xfs_alloc_vextent_worker(
2422 struct work_struct *work)
2423{
2424 struct xfs_alloc_arg *args = container_of(work,
2425 struct xfs_alloc_arg, work);
2426 unsigned long pflags;
2427
2428 /* we are in a transaction context here */
2429 current_set_flags_nested(&pflags, PF_FSTRANS);
2430
2431 args->result = __xfs_alloc_vextent(args);
2432 complete(args->done);
2433
2434 current_restore_flags_nested(&pflags, PF_FSTRANS);
2435}
2436
2437/*
2438 * Data allocation requests often come in with little stack to work on. Push
2439 * them off to a worker thread so there is lots of stack to use. Metadata
2440 * requests, OTOH, are generally from low stack usage paths, so avoid the
2441 * context switch overhead here.
2442 */
2443int
2444xfs_alloc_vextent(
2445 struct xfs_alloc_arg *args)
2446{
2447 DECLARE_COMPLETION_ONSTACK(done);
2448
2449 if (!args->userdata)
2450 return __xfs_alloc_vextent(args);
2451
2452
2453 args->done = &done;
2454 INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
2455 queue_work(xfs_alloc_wq, &args->work);
2456 wait_for_completion(&done);
2457 return args->result;
2458}
2459
2460/* 2507/*
2461 * Free an extent. 2508 * Free an extent.
2462 * Just break up the extent address and hand off to xfs_free_ag_extent 2509 * Just break up the extent address and hand off to xfs_free_ag_extent
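
Both the AGFL and AGF changes follow one shape that recurs through the rest of the series: a shared structural check, thin read and write wrappers, and a const ops table that xfs_trans_read_buf() now takes as its final argument so the check runs at I/O completion instead of in each caller (xfs_read_agf loses its inline validation accordingly). Boiled down with stand-in types:

struct buf;

struct buf_ops {
        void (*verify_read)(struct buf *bp);    /* after the read completes */
        void (*verify_write)(struct buf *bp);   /* before the write is issued */
};

static void hdr_verify(struct buf *bp)        { /* structural checks here */ }
static void hdr_read_verify(struct buf *bp)   { hdr_verify(bp); }
static void hdr_write_verify(struct buf *bp)  { hdr_verify(bp); }

const struct buf_ops hdr_buf_ops = {
        .verify_read    = hdr_read_verify,
        .verify_write   = hdr_write_verify,
};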
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca1..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg {
120 char isfl; /* set if is freelist blocks - !acctg */ 120 char isfl; /* set if is freelist blocks - !acctg */
121 char userdata; /* set if this is user data */ 121 char userdata; /* set if this is user data */
122 xfs_fsblock_t firstblock; /* io first block allocated */ 122 xfs_fsblock_t firstblock; /* io first block allocated */
123 struct completion *done;
124 struct work_struct work;
125 int result;
126} xfs_alloc_arg_t; 123} xfs_alloc_arg_t;
127 124
128/* 125/*
@@ -234,4 +231,7 @@ xfs_alloc_get_rec(
234 xfs_extlen_t *len, /* output: length of extent */ 231 xfs_extlen_t *len, /* output: length of extent */
235 int *stat); /* output: success/failure */ 232 int *stat); /* output: success/failure */
236 233
234extern const struct xfs_buf_ops xfs_agf_buf_ops;
235extern const struct xfs_buf_ops xfs_agfl_buf_ops;
236
237#endif /* __XFS_ALLOC_H__ */ 237#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8f..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
122 XFS_EXTENT_BUSY_SKIP_DISCARD); 122 XFS_EXTENT_BUSY_SKIP_DISCARD);
123 xfs_trans_agbtree_delta(cur->bc_tp, -1); 123 xfs_trans_agbtree_delta(cur->bc_tp, -1);
124
125 xfs_trans_binval(cur->bc_tp, bp);
124 return 0; 126 return 0;
125} 127}
126 128
@@ -270,6 +272,82 @@ xfs_allocbt_key_diff(
270 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 272 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
271} 273}
272 274
275static void
276xfs_allocbt_verify(
277 struct xfs_buf *bp)
278{
279 struct xfs_mount *mp = bp->b_target->bt_mount;
280 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
281 struct xfs_perag *pag = bp->b_pag;
282 unsigned int level;
283 int sblock_ok; /* block passes checks */
284
285 /*
286 * magic number and level verification
287 *
288 * During growfs operations, we can't verify the exact level as the
289 * perag is not fully initialised and hence not attached to the buffer.
290 * In this case, check against the maximum tree depth.
291 */
292 level = be16_to_cpu(block->bb_level);
293 switch (block->bb_magic) {
294 case cpu_to_be32(XFS_ABTB_MAGIC):
295 if (pag)
296 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
297 else
298 sblock_ok = level < mp->m_ag_maxlevels;
299 break;
300 case cpu_to_be32(XFS_ABTC_MAGIC):
301 if (pag)
302 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
303 else
304 sblock_ok = level < mp->m_ag_maxlevels;
305 break;
306 default:
307 sblock_ok = 0;
308 break;
309 }
310
311 /* numrecs verification */
312 sblock_ok = sblock_ok &&
313 be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
314
315 /* sibling pointer verification */
316 sblock_ok = sblock_ok &&
317 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
318 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
319 block->bb_u.s.bb_leftsib &&
320 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
321 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
322 block->bb_u.s.bb_rightsib;
323
324 if (!sblock_ok) {
325 trace_xfs_btree_corrupt(bp, _RET_IP_);
326 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
327 xfs_buf_ioerror(bp, EFSCORRUPTED);
328 }
329}
330
331static void
332xfs_allocbt_read_verify(
333 struct xfs_buf *bp)
334{
335 xfs_allocbt_verify(bp);
336}
337
338static void
339xfs_allocbt_write_verify(
340 struct xfs_buf *bp)
341{
342 xfs_allocbt_verify(bp);
343}
344
345const struct xfs_buf_ops xfs_allocbt_buf_ops = {
346 .verify_read = xfs_allocbt_read_verify,
347 .verify_write = xfs_allocbt_write_verify,
348};
349
350
273#ifdef DEBUG 351#ifdef DEBUG
274STATIC int 352STATIC int
275xfs_allocbt_keys_inorder( 353xfs_allocbt_keys_inorder(
@@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
325 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, 403 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
326 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, 404 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
327 .key_diff = xfs_allocbt_key_diff, 405 .key_diff = xfs_allocbt_key_diff,
406 .buf_ops = &xfs_allocbt_buf_ops,
328#ifdef DEBUG 407#ifdef DEBUG
329 .keys_inorder = xfs_allocbt_keys_inorder, 408 .keys_inorder = xfs_allocbt_keys_inorder,
330 .recs_inorder = xfs_allocbt_recs_inorder, 409 .recs_inorder = xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
93 xfs_agnumber_t, xfs_btnum_t); 93 xfs_agnumber_t, xfs_btnum_t);
94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); 94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
95 95
96extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
97
96#endif /* __XFS_ALLOC_BTREE_H__ */ 98#endif /* __XFS_ALLOC_BTREE_H__ */
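
The .buf_ops member added to xfs_allocbt_ops shows how these verifiers reach generic code: the btree cursor carries the ops table, so every buffer read through the btree layer inherits the right checks. As a sketch with stand-in types:

struct buf_ops;                         /* as in the earlier sketch */

struct btree_ops_sketch {
        /* comparison/init callbacks elided */
        const struct buf_ops *buf_ops;  /* verifier for this btree's blocks */
};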
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e562dd43f41f..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
124 ioend->io_append_trans = tp; 124 ioend->io_append_trans = tp;
125 125
126 /* 126 /*
127 * We will pass freeze protection with a transaction. So tell lockdep 127 * We may pass freeze protection with a transaction. So tell lockdep
128 * we released it. 128 * we released it.
129 */ 129 */
130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
149 xfs_fsize_t isize; 149 xfs_fsize_t isize;
150 150
151 /* 151 /*
152 * The transaction was allocated in the I/O submission thread, 152 * The transaction may have been allocated in the I/O submission thread,
153 * thus we need to mark ourselves as beeing in a transaction 153 * thus we need to mark ourselves as beeing in a transaction manually.
154 * manually. 154 * Similarly for freeze protection.
155 */ 155 */
156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
157 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
158 0, 1, _THIS_IP_);
157 159
158 xfs_ilock(ip, XFS_ILOCK_EXCL); 160 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 161 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
187 189
188 if (ioend->io_type == XFS_IO_UNWRITTEN) 190 if (ioend->io_type == XFS_IO_UNWRITTEN)
189 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 191 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
190 else if (ioend->io_append_trans) 192 else if (ioend->io_append_trans ||
193 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
191 queue_work(mp->m_data_workqueue, &ioend->io_work); 194 queue_work(mp->m_data_workqueue, &ioend->io_work);
192 else 195 else
193 xfs_destroy_ioend(ioend); 196 xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 208 struct xfs_inode *ip = XFS_I(ioend->io_inode);
206 int error = 0; 209 int error = 0;
207 210
208 if (ioend->io_append_trans) {
209 /*
210 * We've got freeze protection passed with the transaction.
211 * Tell lockdep about it.
212 */
213 rwsem_acquire_read(
214 &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215 0, 1, _THIS_IP_);
216 }
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218 ioend->io_error = -EIO; 212 ioend->io_error = -EIO;
219 goto done; 213 goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
 226 * range to normal written extents after the data I/O has finished. 220 * range to normal written extents after the data I/O has finished.
227 */ 221 */
228 if (ioend->io_type == XFS_IO_UNWRITTEN) { 222 if (ioend->io_type == XFS_IO_UNWRITTEN) {
223 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
224 ioend->io_size);
225 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
229 /* 226 /*
230 * For buffered I/O we never preallocate a transaction when 227 * For direct I/O we do not know if we need to allocate blocks
231 * doing the unwritten extent conversion, but for direct I/O 228 * or not so we can't preallocate an append transaction as that
232 * we do not know if we are converting an unwritten extent 229 * results in nested reservations and log space deadlocks. Hence
233 * or not at the point where we preallocate the transaction. 230 * allocate the transaction here. While this is sub-optimal and
231 * can block IO completion for some time, we're stuck with doing
232 * it this way until we can pass the ioend to the direct IO
233 * allocation callbacks and avoid nesting that way.
234 */ 234 */
235 if (ioend->io_append_trans) { 235 error = xfs_setfilesize_trans_alloc(ioend);
236 ASSERT(ioend->io_isdirect); 236 if (error)
237
238 current_set_flags_nested(
239 &ioend->io_append_trans->t_pflags, PF_FSTRANS);
240 xfs_trans_cancel(ioend->io_append_trans, 0);
241 }
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error) {
246 ioend->io_error = -error;
247 goto done; 237 goto done;
248 } 238 error = xfs_setfilesize(ioend);
249 } else if (ioend->io_append_trans) { 239 } else if (ioend->io_append_trans) {
250 error = xfs_setfilesize(ioend); 240 error = xfs_setfilesize(ioend);
251 if (error)
252 ioend->io_error = -error;
253 } else { 241 } else {
254 ASSERT(!xfs_ioend_is_append(ioend)); 242 ASSERT(!xfs_ioend_is_append(ioend));
255 } 243 }
256 244
257done: 245done:
246 if (error)
247 ioend->io_error = -error;
258 xfs_destroy_ioend(ioend); 248 xfs_destroy_ioend(ioend);
259} 249}
260 250
@@ -481,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
481 * 471 *
482 * The fix is two passes across the ioend list - one to start writeback on the 472 * The fix is two passes across the ioend list - one to start writeback on the
483 * buffer_heads, and then submit them for I/O on the second pass. 473 * buffer_heads, and then submit them for I/O on the second pass.
474 *
475 * If @fail is non-zero, it means that we have a situation where some part of
 476 * the submission process has failed after we have marked pages for writeback
477 * and unlocked them. In this situation, we need to fail the ioend chain rather
478 * than submit it to IO. This typically only happens on a filesystem shutdown.
484 */ 479 */
485STATIC void 480STATIC void
486xfs_submit_ioend( 481xfs_submit_ioend(
487 struct writeback_control *wbc, 482 struct writeback_control *wbc,
488 xfs_ioend_t *ioend) 483 xfs_ioend_t *ioend,
484 int fail)
489{ 485{
490 xfs_ioend_t *head = ioend; 486 xfs_ioend_t *head = ioend;
491 xfs_ioend_t *next; 487 xfs_ioend_t *next;
@@ -506,6 +502,18 @@ xfs_submit_ioend(
506 next = ioend->io_list; 502 next = ioend->io_list;
507 bio = NULL; 503 bio = NULL;
508 504
505 /*
506 * If we are failing the IO now, just mark the ioend with an
507 * error and finish it. This will run IO completion immediately
508 * as there is only one reference to the ioend at this point in
509 * time.
510 */
511 if (fail) {
512 ioend->io_error = -fail;
513 xfs_finish_ioend(ioend);
514 continue;
515 }
516
509 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 517 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
510 518
511 if (!bio) { 519 if (!bio) {
@@ -1060,7 +1068,18 @@ xfs_vm_writepage(
1060 1068
1061 xfs_start_page_writeback(page, 1, count); 1069 xfs_start_page_writeback(page, 1, count);
1062 1070
1063 if (ioend && imap_valid) { 1071 /* if there is no IO to be submitted for this page, we are done */
1072 if (!ioend)
1073 return 0;
1074
1075 ASSERT(iohead);
1076
1077 /*
 1078 * Any errors from this point onwards need to be reported through the IO
1079 * completion path as we have marked the initial page as under writeback
1080 * and unlocked it.
1081 */
1082 if (imap_valid) {
1064 xfs_off_t end_index; 1083 xfs_off_t end_index;
1065 1084
1066 end_index = imap.br_startoff + imap.br_blockcount; 1085 end_index = imap.br_startoff + imap.br_blockcount;
@@ -1079,20 +1098,15 @@ xfs_vm_writepage(
1079 wbc, end_index); 1098 wbc, end_index);
1080 } 1099 }
1081 1100
1082 if (iohead) {
1083 /*
1084 * Reserve log space if we might write beyond the on-disk
1085 * inode size.
1086 */
1087 if (ioend->io_type != XFS_IO_UNWRITTEN &&
1088 xfs_ioend_is_append(ioend)) {
1089 err = xfs_setfilesize_trans_alloc(ioend);
1090 if (err)
1091 goto error;
1092 }
1093 1101
1094 xfs_submit_ioend(wbc, iohead); 1102 /*
1095 } 1103 * Reserve log space if we might write beyond the on-disk inode size.
1104 */
1105 err = 0;
1106 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1107 err = xfs_setfilesize_trans_alloc(ioend);
1108
1109 xfs_submit_ioend(wbc, iohead, err);
1096 1110
1097 return 0; 1111 return 0;
1098 1112
@@ -1408,25 +1422,21 @@ xfs_vm_direct_IO(
1408 size_t size = iov_length(iov, nr_segs); 1422 size_t size = iov_length(iov, nr_segs);
1409 1423
1410 /* 1424 /*
1411 * We need to preallocate a transaction for a size update 1425 * We cannot preallocate a size update transaction here as we
1412 * here. In the case that this write both updates the size 1426 * don't know whether allocation is necessary or not. Hence we
1413 * and converts at least on unwritten extent we will cancel 1427 * can only tell IO completion that one is necessary if we are
1414 * the still clean transaction after the I/O has finished. 1428 * not doing unwritten extent conversion.
1415 */ 1429 */
1416 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); 1430 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1417 if (offset + size > XFS_I(inode)->i_d.di_size) { 1431 if (offset + size > XFS_I(inode)->i_d.di_size)
1418 ret = xfs_setfilesize_trans_alloc(ioend);
1419 if (ret)
1420 goto out_destroy_ioend;
1421 ioend->io_isdirect = 1; 1432 ioend->io_isdirect = 1;
1422 }
1423 1433
1424 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1434 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1425 offset, nr_segs, 1435 offset, nr_segs,
1426 xfs_get_blocks_direct, 1436 xfs_get_blocks_direct,
1427 xfs_end_io_direct_write, NULL, 0); 1437 xfs_end_io_direct_write, NULL, 0);
1428 if (ret != -EIOCBQUEUED && iocb->private) 1438 if (ret != -EIOCBQUEUED && iocb->private)
1429 goto out_trans_cancel; 1439 goto out_destroy_ioend;
1430 } else { 1440 } else {
1431 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1432 offset, nr_segs, 1442 offset, nr_segs,
@@ -1436,15 +1446,6 @@ xfs_vm_direct_IO(
1436 1446
1437 return ret; 1447 return ret;
1438 1448
1439out_trans_cancel:
1440 if (ioend->io_append_trans) {
1441 current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1442 PF_FSTRANS);
1443 rwsem_acquire_read(
1444 &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1445 0, 1, _THIS_IP_);
1446 xfs_trans_cancel(ioend->io_append_trans, 0);
1447 }
1448out_destroy_ioend: 1449out_destroy_ioend:
1449 xfs_destroy_ioend(ioend); 1450 xfs_destroy_ioend(ioend);
1450 return ret; 1451 return ret;
@@ -1617,7 +1618,7 @@ xfs_vm_bmap(
1617 1618
1618 trace_xfs_vm_bmap(XFS_I(inode)); 1619 trace_xfs_vm_bmap(XFS_I(inode));
1619 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1620 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1620 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1621 filemap_write_and_wait(mapping);
1621 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1622 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1622 return generic_block_bmap(mapping, block, xfs_get_blocks); 1623 return generic_block_bmap(mapping, block, xfs_get_blocks);
1623} 1624}
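
After this rework, completion decides how the on-disk size update happens: unwritten conversion comes first; an appending direct write allocates its transaction here, in process context, so the nested log reservations that the old submission-time preallocation caused are gone; buffered appends keep their preallocated transaction. Condensed from the hunks, with the error funneled through one exit:

        if (ioend->io_type == XFS_IO_UNWRITTEN) {
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
        } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
                error = xfs_setfilesize_trans_alloc(ioend); /* alloc now */
                if (!error)
                        error = xfs_setfilesize(ioend);
        } else if (ioend->io_append_trans) {
                error = xfs_setfilesize(ioend); /* trans from submission */
        }
        if (error)
                ioend->io_error = -error;
        xfs_destroy_ioend(ioend);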
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
903 */ 903 */
904 dp = args->dp; 904 dp = args->dp;
905 args->blkno = 0; 905 args->blkno = 0;
906 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 906 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
907 XFS_ATTR_FORK);
908 if (error) 907 if (error)
909 return(error); 908 return error;
910 ASSERT(bp != NULL);
911 909
912 /* 910 /*
913 * Look up the given attribute in the leaf block. Figure out if 911 * Look up the given attribute in the leaf block. Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1031 * Read in the block containing the "old" attr, then 1029 * Read in the block containing the "old" attr, then
1032 * remove the "old" attr from that block (neat, huh!) 1030 * remove the "old" attr from that block (neat, huh!)
1033 */ 1031 */
1034 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, 1032 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
1035 &bp, XFS_ATTR_FORK); 1033 -1, &bp);
1036 if (error) 1034 if (error)
1037 return(error); 1035 return error;
1038 ASSERT(bp != NULL); 1036
1039 (void)xfs_attr_leaf_remove(bp, args); 1037 xfs_attr_leaf_remove(bp, args);
1040 1038
1041 /* 1039 /*
1042 * If the result is small enough, shrink it all into the inode. 1040 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1100 */ 1098 */
1101 dp = args->dp; 1099 dp = args->dp;
1102 args->blkno = 0; 1100 args->blkno = 0;
1103 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1101 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1104 XFS_ATTR_FORK); 1102 if (error)
1105 if (error) { 1103 return error;
1106 return(error);
1107 }
1108 1104
1109 ASSERT(bp != NULL);
1110 error = xfs_attr_leaf_lookup_int(bp, args); 1105 error = xfs_attr_leaf_lookup_int(bp, args);
1111 if (error == ENOATTR) { 1106 if (error == ENOATTR) {
1112 xfs_trans_brelse(args->trans, bp); 1107 xfs_trans_brelse(args->trans, bp);
1113 return(error); 1108 return(error);
1114 } 1109 }
1115 1110
1116 (void)xfs_attr_leaf_remove(bp, args); 1111 xfs_attr_leaf_remove(bp, args);
1117 1112
1118 /* 1113 /*
1119 * If the result is small enough, shrink it all into the inode. 1114 * If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1155 struct xfs_buf *bp; 1150 struct xfs_buf *bp;
1156 int error; 1151 int error;
1157 1152
1153 trace_xfs_attr_leaf_get(args);
1154
1158 args->blkno = 0; 1155 args->blkno = 0;
1159 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1156 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1160 XFS_ATTR_FORK);
1161 if (error) 1157 if (error)
1162 return(error); 1158 return error;
1163 ASSERT(bp != NULL);
1164 1159
1165 error = xfs_attr_leaf_lookup_int(bp, args); 1160 error = xfs_attr_leaf_lookup_int(bp, args);
1166 if (error != EEXIST) { 1161 if (error != EEXIST) {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1181STATIC int 1176STATIC int
1182xfs_attr_leaf_list(xfs_attr_list_context_t *context) 1177xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1183{ 1178{
1184 xfs_attr_leafblock_t *leaf;
1185 int error; 1179 int error;
1186 struct xfs_buf *bp; 1180 struct xfs_buf *bp;
1187 1181
1182 trace_xfs_attr_leaf_list(context);
1183
1188 context->cursor->blkno = 0; 1184 context->cursor->blkno = 0;
1189 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); 1185 error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
1190 if (error) 1186 if (error)
1191 return XFS_ERROR(error); 1187 return XFS_ERROR(error);
1192 ASSERT(bp != NULL);
1193 leaf = bp->b_addr;
1194 if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1195 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1196 context->dp->i_mount, leaf);
1197 xfs_trans_brelse(NULL, bp);
1198 return XFS_ERROR(EFSCORRUPTED);
1199 }
1200 1188
1201 error = xfs_attr_leaf_list_int(bp, context); 1189 error = xfs_attr_leaf_list_int(bp, context);
1202 xfs_trans_brelse(NULL, bp); 1190 xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1600 ASSERT(state->path.blk[0].bp); 1588 ASSERT(state->path.blk[0].bp);
1601 state->path.blk[0].bp = NULL; 1589 state->path.blk[0].bp = NULL;
1602 1590
1603 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, 1591 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
1604 XFS_ATTR_FORK);
1605 if (error) 1592 if (error)
1606 goto out; 1593 goto out;
1607 ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
1608 cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1609 1594
1610 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1595 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1611 xfs_bmap_init(args->flist, args->firstblock); 1596 xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
1653 xfs_da_state_blk_t *blk; 1638 xfs_da_state_blk_t *blk;
1654 int level; 1639 int level;
1655 1640
1641 trace_xfs_attr_fillstate(state->args);
1642
1656 /* 1643 /*
1657 * Roll down the "path" in the state structure, storing the on-disk 1644 * Roll down the "path" in the state structure, storing the on-disk
1658 * block number for those buffers in the "path". 1645 * block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1699 xfs_da_state_blk_t *blk; 1686 xfs_da_state_blk_t *blk;
1700 int level, error; 1687 int level, error;
1701 1688
1689 trace_xfs_attr_refillstate(state->args);
1690
1702 /* 1691 /*
1703 * Roll down the "path" in the state structure, storing the on-disk 1692 * Roll down the "path" in the state structure, storing the on-disk
1704 * block number for those buffers in the "path". 1693 * block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1707 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1696 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1708 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1697 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1709 if (blk->disk_blkno) { 1698 if (blk->disk_blkno) {
1710 error = xfs_da_read_buf(state->args->trans, 1699 error = xfs_da_node_read(state->args->trans,
1711 state->args->dp, 1700 state->args->dp,
1712 blk->blkno, blk->disk_blkno, 1701 blk->blkno, blk->disk_blkno,
1713 &blk->bp, XFS_ATTR_FORK); 1702 &blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1726 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1715 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1727 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1716 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1728 if (blk->disk_blkno) { 1717 if (blk->disk_blkno) {
1729 error = xfs_da_read_buf(state->args->trans, 1718 error = xfs_da_node_read(state->args->trans,
1730 state->args->dp, 1719 state->args->dp,
1731 blk->blkno, blk->disk_blkno, 1720 blk->blkno, blk->disk_blkno,
1732 &blk->bp, XFS_ATTR_FORK); 1721 &blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
1755 int error, retval; 1744 int error, retval;
1756 int i; 1745 int i;
1757 1746
1747 trace_xfs_attr_node_get(args);
1748
1758 state = xfs_da_state_alloc(); 1749 state = xfs_da_state_alloc();
1759 state->args = args; 1750 state->args = args;
1760 state->mp = args->dp->i_mount; 1751 state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1804 int error, i; 1795 int error, i;
1805 struct xfs_buf *bp; 1796 struct xfs_buf *bp;
1806 1797
1798 trace_xfs_attr_node_list(context);
1799
1807 cursor = context->cursor; 1800 cursor = context->cursor;
1808 cursor->initted = 1; 1801 cursor->initted = 1;
1809 1802
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1814 */ 1807 */
1815 bp = NULL; 1808 bp = NULL;
1816 if (cursor->blkno > 0) { 1809 if (cursor->blkno > 0) {
1817 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1810 error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
1818 &bp, XFS_ATTR_FORK); 1811 &bp, XFS_ATTR_FORK);
1819 if ((error != 0) && (error != EFSCORRUPTED)) 1812 if ((error != 0) && (error != EFSCORRUPTED))
1820 return(error); 1813 return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1856 if (bp == NULL) { 1849 if (bp == NULL) {
1857 cursor->blkno = 0; 1850 cursor->blkno = 0;
1858 for (;;) { 1851 for (;;) {
1859 error = xfs_da_read_buf(NULL, context->dp, 1852 error = xfs_da_node_read(NULL, context->dp,
1860 cursor->blkno, -1, &bp, 1853 cursor->blkno, -1, &bp,
1861 XFS_ATTR_FORK); 1854 XFS_ATTR_FORK);
1862 if (error) 1855 if (error)
1863 return(error); 1856 return(error);
1864 if (unlikely(bp == NULL)) {
1865 XFS_ERROR_REPORT("xfs_attr_node_list(2)",
1866 XFS_ERRLEVEL_LOW,
1867 context->dp->i_mount);
1868 return(XFS_ERROR(EFSCORRUPTED));
1869 }
1870 node = bp->b_addr; 1857 node = bp->b_addr;
1871 if (node->hdr.info.magic == 1858 if (node->hdr.info.magic ==
1872 cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) 1859 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1907 */ 1894 */
1908 for (;;) { 1895 for (;;) {
1909 leaf = bp->b_addr; 1896 leaf = bp->b_addr;
1910 if (unlikely(leaf->hdr.info.magic !=
1911 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1912 XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
1913 XFS_ERRLEVEL_LOW,
1914 context->dp->i_mount, leaf);
1915 xfs_trans_brelse(NULL, bp);
1916 return(XFS_ERROR(EFSCORRUPTED));
1917 }
1918 error = xfs_attr_leaf_list_int(bp, context); 1897 error = xfs_attr_leaf_list_int(bp, context);
1919 if (error) { 1898 if (error) {
1920 xfs_trans_brelse(NULL, bp); 1899 xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1924 break; 1903 break;
1925 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); 1904 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
1926 xfs_trans_brelse(NULL, bp); 1905 xfs_trans_brelse(NULL, bp);
1927 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1906 error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
1928 &bp, XFS_ATTR_FORK); 1907 &bp);
1929 if (error) 1908 if (error)
1930 return(error); 1909 return error;
1931 if (unlikely((bp == NULL))) {
1932 XFS_ERROR_REPORT("xfs_attr_node_list(5)",
1933 XFS_ERRLEVEL_LOW,
1934 context->dp->i_mount);
1935 return(XFS_ERROR(EFSCORRUPTED));
1936 }
1937 } 1910 }
1938 xfs_trans_brelse(NULL, bp); 1911 xfs_trans_brelse(NULL, bp);
1939 return(0); 1912 return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1959 int nmap, error, tmp, valuelen, blkcnt, i; 1932 int nmap, error, tmp, valuelen, blkcnt, i;
1960 xfs_dablk_t lblkno; 1933 xfs_dablk_t lblkno;
1961 1934
1935 trace_xfs_attr_rmtval_get(args);
1936
1962 ASSERT(!(args->flags & ATTR_KERNOVAL)); 1937 ASSERT(!(args->flags & ATTR_KERNOVAL));
1963 1938
1964 mp = args->dp->i_mount; 1939 mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1980 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 1955 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1981 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 1956 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1982 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1957 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1983 dblkno, blkcnt, 0, &bp); 1958 dblkno, blkcnt, 0, &bp, NULL);
1984 if (error) 1959 if (error)
1985 return(error); 1960 return(error);
1986 1961
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2014 xfs_dablk_t lblkno; 1989 xfs_dablk_t lblkno;
2015 int blkcnt, valuelen, nmap, error, tmp, committed; 1990 int blkcnt, valuelen, nmap, error, tmp, committed;
2016 1991
1992 trace_xfs_attr_rmtval_set(args);
1993
2017 dp = args->dp; 1994 dp = args->dp;
2018 mp = dp->i_mount; 1995 mp = dp->i_mount;
2019 src = args->value; 1996 src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2143 xfs_dablk_t lblkno; 2120 xfs_dablk_t lblkno;
2144 int valuelen, blkcnt, nmap, error, done, committed; 2121 int valuelen, blkcnt, nmap, error, done, committed;
2145 2122
2123 trace_xfs_attr_rmtval_remove(args);
2124
2146 mp = args->dp->i_mount; 2125 mp = args->dp->i_mount;
2147 2126
2148 /* 2127 /*
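
Every converted call site in this file now takes the same two-line shape; the magic-number ASSERTs disappear because the verifier attached by xfs_attr_leaf_read() performs that check at I/O time:

        error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
                                   -1, &bp);
        if (error)
                return error;
        /* bp is non-NULL and already validated as an attr leaf block */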
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d330111ca738..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
57 struct xfs_buf **bpp); 57 struct xfs_buf **bpp);
58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, 58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
59 xfs_da_args_t *args, int freemap_index); 59 xfs_da_args_t *args, int freemap_index);
60STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); 60STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
61 struct xfs_buf *leaf_buffer);
61STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, 62STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
62 xfs_da_state_blk_t *blk1, 63 xfs_da_state_blk_t *blk1,
63 xfs_da_state_blk_t *blk2); 64 xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
87 xfs_mount_t *mp); 88 xfs_mount_t *mp);
88STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 89STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
89 90
91static void
92xfs_attr_leaf_verify(
93 struct xfs_buf *bp)
94{
95 struct xfs_mount *mp = bp->b_target->bt_mount;
96 struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
97 int block_ok = 0;
98
99 block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
100 if (!block_ok) {
101 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
102 xfs_buf_ioerror(bp, EFSCORRUPTED);
103 }
104}
105
106static void
107xfs_attr_leaf_read_verify(
108 struct xfs_buf *bp)
109{
110 xfs_attr_leaf_verify(bp);
111}
112
113static void
114xfs_attr_leaf_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_attr_leaf_verify(bp);
118}
119
120const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
121 .verify_read = xfs_attr_leaf_read_verify,
122 .verify_write = xfs_attr_leaf_write_verify,
123};
124
125int
126xfs_attr_leaf_read(
127 struct xfs_trans *tp,
128 struct xfs_inode *dp,
129 xfs_dablk_t bno,
130 xfs_daddr_t mappedbno,
131 struct xfs_buf **bpp)
132{
133 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
134 XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
135}
136
90/*======================================================================== 137/*========================================================================
91 * Namespace helper routines 138 * Namespace helper routines
92 *========================================================================*/ 139 *========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
869 error = xfs_da_grow_inode(args, &blkno); 916 error = xfs_da_grow_inode(args, &blkno);
870 if (error) 917 if (error)
871 goto out; 918 goto out;
872 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, 919 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
873 XFS_ATTR_FORK);
874 if (error) 920 if (error)
875 goto out; 921 goto out;
876 ASSERT(bp1 != NULL); 922
877 bp2 = NULL; 923 bp2 = NULL;
878 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, 924 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
879 XFS_ATTR_FORK); 925 XFS_ATTR_FORK);
880 if (error) 926 if (error)
881 goto out; 927 goto out;
882 ASSERT(bp2 != NULL); 928 bp2->b_ops = bp1->b_ops;
883 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); 929 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
884 bp1 = NULL; 930 bp1 = NULL;
885 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); 931 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
933 XFS_ATTR_FORK); 979 XFS_ATTR_FORK);
934 if (error) 980 if (error)
935 return(error); 981 return(error);
936 ASSERT(bp != NULL); 982 bp->b_ops = &xfs_attr_leaf_buf_ops;
937 leaf = bp->b_addr; 983 leaf = bp->b_addr;
938 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); 984 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
939 hdr = &leaf->hdr; 985 hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
1071 * Compact the entries to coalesce free space. 1117 * Compact the entries to coalesce free space.
1072 * This may change the hdr->count via dropping INCOMPLETE entries. 1118 * This may change the hdr->count via dropping INCOMPLETE entries.
1073 */ 1119 */
1074 xfs_attr_leaf_compact(args->trans, bp); 1120 xfs_attr_leaf_compact(args, bp);
1075 1121
1076 /* 1122 /*
1077 * After compaction, the block is guaranteed to have only one 1123 * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
1102 xfs_mount_t *mp; 1148 xfs_mount_t *mp;
1103 int tmp, i; 1149 int tmp, i;
1104 1150
1151 trace_xfs_attr_leaf_add_work(args);
1152
1105 leaf = bp->b_addr; 1153 leaf = bp->b_addr;
1106 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1154 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1107 hdr = &leaf->hdr; 1155 hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
1214 */ 1262 */
1215STATIC void 1263STATIC void
1216xfs_attr_leaf_compact( 1264xfs_attr_leaf_compact(
1217 struct xfs_trans *trans, 1265 struct xfs_da_args *args,
1218 struct xfs_buf *bp) 1266 struct xfs_buf *bp)
1219{ 1267{
1220 xfs_attr_leafblock_t *leaf_s, *leaf_d; 1268 xfs_attr_leafblock_t *leaf_s, *leaf_d;
1221 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; 1269 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
1222 xfs_mount_t *mp; 1270 struct xfs_trans *trans = args->trans;
1223 char *tmpbuffer; 1271 struct xfs_mount *mp = trans->t_mountp;
1272 char *tmpbuffer;
1273
1274 trace_xfs_attr_leaf_compact(args);
1224 1275
1225 mp = trans->t_mountp;
1226 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); 1276 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1227 ASSERT(tmpbuffer != NULL); 1277 ASSERT(tmpbuffer != NULL);
1228 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); 1278 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1291,6 +1341,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1291 leaf2 = blk2->bp->b_addr; 1341 leaf2 = blk2->bp->b_addr;
1292 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1342 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1293 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1343 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1344 ASSERT(leaf2->hdr.count == 0);
1294 args = state->args; 1345 args = state->args;
1295 1346
1296 trace_xfs_attr_leaf_rebalance(args); 1347 trace_xfs_attr_leaf_rebalance(args);
@@ -1344,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1344 max = be16_to_cpu(hdr2->firstused) 1395 max = be16_to_cpu(hdr2->firstused)
1345 - sizeof(xfs_attr_leaf_hdr_t); 1396 - sizeof(xfs_attr_leaf_hdr_t);
1346 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); 1397 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
1347 if (space > max) { 1398 if (space > max)
1348 xfs_attr_leaf_compact(args->trans, blk2->bp); 1399 xfs_attr_leaf_compact(args, blk2->bp);
1349 }
1350 1400
1351 /* 1401 /*
1352 * Move high entries from leaf1 to low end of leaf2. 1402 * Move high entries from leaf1 to low end of leaf2.
@@ -1361,6 +1411,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1361 * I assert that since all callers pass in an empty 1411 * I assert that since all callers pass in an empty
1362 * second buffer, this code should never execute. 1412 * second buffer, this code should never execute.
1363 */ 1413 */
1414 ASSERT(0);
1364 1415
1365 /* 1416 /*
1366 * Figure the total bytes to be added to the destination leaf. 1417 * Figure the total bytes to be added to the destination leaf.
@@ -1376,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1376 max = be16_to_cpu(hdr1->firstused) 1427 max = be16_to_cpu(hdr1->firstused)
1377 - sizeof(xfs_attr_leaf_hdr_t); 1428 - sizeof(xfs_attr_leaf_hdr_t);
1378 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); 1429 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
1379 if (space > max) { 1430 if (space > max)
1380 xfs_attr_leaf_compact(args->trans, blk1->bp); 1431 xfs_attr_leaf_compact(args, blk1->bp);
1381 }
1382 1432
1383 /* 1433 /*
1384 * Move low entries from leaf2 to high end of leaf1. 1434 * Move low entries from leaf2 to high end of leaf1.
@@ -1422,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1422 args->index2 = 0; 1472 args->index2 = 0;
1423 args->blkno2 = blk2->blkno; 1473 args->blkno2 = blk2->blkno;
1424 } else { 1474 } else {
1475 /*
1476 * On a double leaf split, the original attr location
1477 * is already stored in blkno2/index2, so don't
 1478 * overwrite it, otherwise we corrupt the tree.
1479 */
1425 blk2->index = blk1->index 1480 blk2->index = blk1->index
1426 - be16_to_cpu(leaf1->hdr.count); 1481 - be16_to_cpu(leaf1->hdr.count);
1427 args->index = args->index2 = blk2->index; 1482 args->index = blk2->index;
1428 args->blkno = args->blkno2 = blk2->blkno; 1483 args->blkno = blk2->blkno;
1484 if (!state->extravalid) {
1485 /*
1486 * set the new attr location to match the old
1487 * one and let the higher level split code
1488 * decide where in the leaf to place it.
1489 */
1490 args->index2 = blk2->index;
1491 args->blkno2 = blk2->blkno;
1492 }
1429 } 1493 }
1430 } else { 1494 } else {
1431 ASSERT(state->inleaf == 1); 1495 ASSERT(state->inleaf == 1);
@@ -1561,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1561 xfs_dablk_t blkno; 1625 xfs_dablk_t blkno;
1562 struct xfs_buf *bp; 1626 struct xfs_buf *bp;
1563 1627
1628 trace_xfs_attr_leaf_toosmall(state->args);
1629
1564 /* 1630 /*
1565 * Check for the degenerate case of the block being over 50% full. 1631 * Check for the degenerate case of the block being over 50% full.
1566 * If so, it's not worth even looking to see if we might be able 1632 * If so, it's not worth even looking to see if we might be able
@@ -1620,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1620 blkno = be32_to_cpu(info->back); 1686 blkno = be32_to_cpu(info->back);
1621 if (blkno == 0) 1687 if (blkno == 0)
1622 continue; 1688 continue;
1623 error = xfs_da_read_buf(state->args->trans, state->args->dp, 1689 error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
1624 blkno, -1, &bp, XFS_ATTR_FORK); 1690 blkno, -1, &bp);
1625 if (error) 1691 if (error)
1626 return(error); 1692 return(error);
1627 ASSERT(bp != NULL);
1628 1693
1629 leaf = (xfs_attr_leafblock_t *)info; 1694 leaf = (xfs_attr_leafblock_t *)info;
1630 count = be16_to_cpu(leaf->hdr.count); 1695 count = be16_to_cpu(leaf->hdr.count);
1631 bytes = state->blocksize - (state->blocksize>>2); 1696 bytes = state->blocksize - (state->blocksize>>2);
1632 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1697 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1633 leaf = bp->b_addr; 1698 leaf = bp->b_addr;
1634 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1635 count += be16_to_cpu(leaf->hdr.count); 1699 count += be16_to_cpu(leaf->hdr.count);
1636 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1700 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1637 bytes -= count * sizeof(xfs_attr_leaf_entry_t); 1701 bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1686,6 +1750,8 @@ xfs_attr_leaf_remove(
1686 int tablesize, tmp, i; 1750 int tablesize, tmp, i;
1687 xfs_mount_t *mp; 1751 xfs_mount_t *mp;
1688 1752
1753 trace_xfs_attr_leaf_remove(args);
1754
1689 leaf = bp->b_addr; 1755 leaf = bp->b_addr;
1690 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1756 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1691 hdr = &leaf->hdr; 1757 hdr = &leaf->hdr;
@@ -2495,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2495 /* 2561 /*
2496 * Set up the operation. 2562 * Set up the operation.
2497 */ 2563 */
2498 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2564 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2499 XFS_ATTR_FORK); 2565 if (error)
2500 if (error) {
2501 return(error); 2566 return(error);
2502 }
2503 ASSERT(bp != NULL);
2504 2567
2505 leaf = bp->b_addr; 2568 leaf = bp->b_addr;
2506 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2507 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2569 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2508 ASSERT(args->index >= 0); 2570 ASSERT(args->index >= 0);
2509 entry = &leaf->entries[ args->index ]; 2571 entry = &leaf->entries[ args->index ];
@@ -2560,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2560 /* 2622 /*
2561 * Set up the operation. 2623 * Set up the operation.
2562 */ 2624 */
2563 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2625 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2564 XFS_ATTR_FORK); 2626 if (error)
2565 if (error) {
2566 return(error); 2627 return(error);
2567 }
2568 ASSERT(bp != NULL);
2569 2628
2570 leaf = bp->b_addr; 2629 leaf = bp->b_addr;
2571 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2572 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2630 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2573 ASSERT(args->index >= 0); 2631 ASSERT(args->index >= 0);
2574 entry = &leaf->entries[ args->index ]; 2632 entry = &leaf->entries[ args->index ];
@@ -2617,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2617 /* 2675 /*
2618 * Read the block containing the "old" attr 2676 * Read the block containing the "old" attr
2619 */ 2677 */
2620 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, 2678 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
2621 XFS_ATTR_FORK); 2679 if (error)
2622 if (error) { 2680 return error;
2623 return(error);
2624 }
2625 ASSERT(bp1 != NULL);
2626 2681
2627 /* 2682 /*
2628 * Read the block containing the "new" attr, if it is different 2683 * Read the block containing the "new" attr, if it is different
2629 */ 2684 */
2630 if (args->blkno2 != args->blkno) { 2685 if (args->blkno2 != args->blkno) {
2631 error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, 2686 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
2632 -1, &bp2, XFS_ATTR_FORK); 2687 -1, &bp2);
2633 if (error) { 2688 if (error)
2634 return(error); 2689 return error;
2635 }
2636 ASSERT(bp2 != NULL);
2637 } else { 2690 } else {
2638 bp2 = bp1; 2691 bp2 = bp1;
2639 } 2692 }
2640 2693
2641 leaf1 = bp1->b_addr; 2694 leaf1 = bp1->b_addr;
2642 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2643 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); 2695 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
2644 ASSERT(args->index >= 0); 2696 ASSERT(args->index >= 0);
2645 entry1 = &leaf1->entries[ args->index ]; 2697 entry1 = &leaf1->entries[ args->index ];
2646 2698
2647 leaf2 = bp2->b_addr; 2699 leaf2 = bp2->b_addr;
2648 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2649 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); 2700 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
2650 ASSERT(args->index2 >= 0); 2701 ASSERT(args->index2 >= 0);
2651 entry2 = &leaf2->entries[ args->index2 ]; 2702 entry2 = &leaf2->entries[ args->index2 ];
@@ -2730,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2730 * the extents in reverse order the extent containing 2781 * the extents in reverse order the extent containing
2731 * block 0 must still be there. 2782 * block 0 must still be there.
2732 */ 2783 */
2733 error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); 2784 error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
2734 if (error) 2785 if (error)
2735 return(error); 2786 return(error);
2736 blkno = XFS_BUF_ADDR(bp); 2787 blkno = XFS_BUF_ADDR(bp);
@@ -2815,7 +2866,7 @@ xfs_attr_node_inactive(
2815 * traversal of the tree so we may deal with many blocks 2866 * traversal of the tree so we may deal with many blocks
2816 * before we come back to this one. 2867 * before we come back to this one.
2817 */ 2868 */
2818 error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, 2869 error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
2819 XFS_ATTR_FORK); 2870 XFS_ATTR_FORK);
2820 if (error) 2871 if (error)
2821 return(error); 2872 return(error);
@@ -2856,8 +2907,8 @@ xfs_attr_node_inactive(
2856 * child block number. 2907 * child block number.
2857 */ 2908 */
2858 if ((i+1) < count) { 2909 if ((i+1) < count) {
2859 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, 2910 error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
2860 &bp, XFS_ATTR_FORK); 2911 &bp, XFS_ATTR_FORK);
2861 if (error) 2912 if (error)
2862 return(error); 2913 return(error);
2863 child_fsb = be32_to_cpu(node->btree[i+1].before); 2914 child_fsb = be32_to_cpu(node->btree[i+1].before);
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
261 struct xfs_buf *leaf2_bp); 261 struct xfs_buf *leaf2_bp);
262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
263 int *local); 263 int *local);
264int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
265 xfs_dablk_t bno, xfs_daddr_t mappedbno,
266 struct xfs_buf **bpp);
267
268extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
269
264#endif /* __XFS_ATTR_LEAF_H__ */ 270#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa77707b..0e92d12765d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
2437 * Normal allocation, done through xfs_alloc_vextent. 2437 * Normal allocation, done through xfs_alloc_vextent.
2438 */ 2438 */
2439 tryagain = isaligned = 0; 2439 tryagain = isaligned = 0;
2440 memset(&args, 0, sizeof(args));
2440 args.tp = ap->tp; 2441 args.tp = ap->tp;
2441 args.mp = mp; 2442 args.mp = mp;
2442 args.fsbno = ap->blkno; 2443 args.fsbno = ap->blkno;
@@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents(
2661 if ((error = xfs_btree_check_lptr(cur, cbno, 1))) 2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
2662 return error; 2663 return error;
2663#endif 2664#endif
2664 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 2665 error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
2665 XFS_BMAP_BTREE_REF))) 2666 &xfs_bmbt_buf_ops);
2667 if (error)
2666 return error; 2668 return error;
2667 cblock = XFS_BUF_TO_BLOCK(cbp); 2669 cblock = XFS_BUF_TO_BLOCK(cbp);
2668 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 2670 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree(
3082 * Convert to a btree with two levels, one record in root. 3084 * Convert to a btree with two levels, one record in root.
3083 */ 3085 */
3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); 3086 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
3087 memset(&args, 0, sizeof(args));
3085 args.tp = tp; 3088 args.tp = tp;
3086 args.mp = mp; 3089 args.mp = mp;
3087 args.firstblock = *firstblock; 3090 args.firstblock = *firstblock;
@@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree(
3121 /* 3124 /*
3122 * Fill in the child block. 3125 * Fill in the child block.
3123 */ 3126 */
3127 abp->b_ops = &xfs_bmbt_buf_ops;
3124 ablock = XFS_BUF_TO_BLOCK(abp); 3128 ablock = XFS_BUF_TO_BLOCK(abp);
3125 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3129 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3126 ablock->bb_level = 0; 3130 ablock->bb_level = 0;
@@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents(
3237 xfs_buf_t *bp; /* buffer for extent block */ 3241 xfs_buf_t *bp; /* buffer for extent block */
3238 xfs_bmbt_rec_host_t *ep;/* extent record pointer */ 3242 xfs_bmbt_rec_host_t *ep;/* extent record pointer */
3239 3243
3244 memset(&args, 0, sizeof(args));
3240 args.tp = tp; 3245 args.tp = tp;
3241 args.mp = ip->i_mount; 3246 args.mp = ip->i_mount;
3242 args.firstblock = *firstblock; 3247 args.firstblock = *firstblock;
@@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents(
3266 ASSERT(args.len == 1); 3271 ASSERT(args.len == 1);
3267 *firstblock = args.fsbno; 3272 *firstblock = args.fsbno;
3268 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3273 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3274 bp->b_ops = &xfs_bmbt_buf_ops;
3269 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 3275 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3270 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3276 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3271 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3277 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4075,8 +4081,9 @@ xfs_bmap_read_extents(
4075 * pointer (leftmost) at each level. 4081 * pointer (leftmost) at each level.
4076 */ 4082 */
4077 while (level-- > 0) { 4083 while (level-- > 0) {
4078 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4084 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4079 XFS_BMAP_BTREE_REF))) 4085 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4086 if (error)
4080 return error; 4087 return error;
4081 block = XFS_BUF_TO_BLOCK(bp); 4088 block = XFS_BUF_TO_BLOCK(bp);
4082 XFS_WANT_CORRUPTED_GOTO( 4089 XFS_WANT_CORRUPTED_GOTO(
@@ -4121,7 +4128,8 @@ xfs_bmap_read_extents(
4121 */ 4128 */
4122 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 4129 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4123 if (nextbno != NULLFSBLOCK) 4130 if (nextbno != NULLFSBLOCK)
4124 xfs_btree_reada_bufl(mp, nextbno, 1); 4131 xfs_btree_reada_bufl(mp, nextbno, 1,
4132 &xfs_bmbt_buf_ops);
4125 /* 4133 /*
4126 * Copy records into the extent records. 4134 * Copy records into the extent records.
4127 */ 4135 */
@@ -4153,8 +4161,9 @@ xfs_bmap_read_extents(
4153 */ 4161 */
4154 if (bno == NULLFSBLOCK) 4162 if (bno == NULLFSBLOCK)
4155 break; 4163 break;
4156 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4164 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4157 XFS_BMAP_BTREE_REF))) 4165 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4166 if (error)
4158 return error; 4167 return error;
4159 block = XFS_BUF_TO_BLOCK(bp); 4168 block = XFS_BUF_TO_BLOCK(bp);
4160 } 4169 }
@@ -4616,12 +4625,11 @@ xfs_bmapi_delay(
4616 4625
4617 4626
4618STATIC int 4627STATIC int
4619xfs_bmapi_allocate( 4628__xfs_bmapi_allocate(
4620 struct xfs_bmalloca *bma, 4629 struct xfs_bmalloca *bma)
4621 int flags)
4622{ 4630{
4623 struct xfs_mount *mp = bma->ip->i_mount; 4631 struct xfs_mount *mp = bma->ip->i_mount;
4624 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4632 int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
4625 XFS_ATTR_FORK : XFS_DATA_FORK; 4633 XFS_ATTR_FORK : XFS_DATA_FORK;
4626 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); 4634 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4627 int tmp_logflags = 0; 4635 int tmp_logflags = 0;
@@ -4654,24 +4662,27 @@ xfs_bmapi_allocate(
4654 * Indicate if this is the first user data in the file, or just any 4662 * Indicate if this is the first user data in the file, or just any
4655 * user data. 4663 * user data.
4656 */ 4664 */
4657 if (!(flags & XFS_BMAPI_METADATA)) { 4665 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4658 bma->userdata = (bma->offset == 0) ? 4666 bma->userdata = (bma->offset == 0) ?
4659 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; 4667 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
4660 } 4668 }
4661 4669
4662 bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; 4670 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
4663 4671
4664 /* 4672 /*
4665 * Only want to do the alignment at the eof if it is userdata and 4673 * Only want to do the alignment at the eof if it is userdata and
4666 * allocation length is larger than a stripe unit. 4674 * allocation length is larger than a stripe unit.
4667 */ 4675 */
4668 if (mp->m_dalign && bma->length >= mp->m_dalign && 4676 if (mp->m_dalign && bma->length >= mp->m_dalign &&
4669 !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { 4677 !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
4670 error = xfs_bmap_isaeof(bma, whichfork); 4678 error = xfs_bmap_isaeof(bma, whichfork);
4671 if (error) 4679 if (error)
4672 return error; 4680 return error;
4673 } 4681 }
4674 4682
4683 if (bma->flags & XFS_BMAPI_STACK_SWITCH)
4684 bma->stack_switch = 1;
4685
4675 error = xfs_bmap_alloc(bma); 4686 error = xfs_bmap_alloc(bma);
4676 if (error) 4687 if (error)
4677 return error; 4688 return error;
@@ -4706,7 +4717,7 @@ xfs_bmapi_allocate(
4706 * A wasdelay extent has been initialized, so shouldn't be flagged 4717 * A wasdelay extent has been initialized, so shouldn't be flagged
4707 * as unwritten. 4718 * as unwritten.
4708 */ 4719 */
4709 if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && 4720 if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
4710 xfs_sb_version_hasextflgbit(&mp->m_sb)) 4721 xfs_sb_version_hasextflgbit(&mp->m_sb))
4711 bma->got.br_state = XFS_EXT_UNWRITTEN; 4722 bma->got.br_state = XFS_EXT_UNWRITTEN;
4712 4723
@@ -4734,6 +4745,45 @@ xfs_bmapi_allocate(
4734 return 0; 4745 return 0;
4735} 4746}
4736 4747
4748static void
4749xfs_bmapi_allocate_worker(
4750 struct work_struct *work)
4751{
4752 struct xfs_bmalloca *args = container_of(work,
4753 struct xfs_bmalloca, work);
4754 unsigned long pflags;
4755
4756 /* we are in a transaction context here */
4757 current_set_flags_nested(&pflags, PF_FSTRANS);
4758
4759 args->result = __xfs_bmapi_allocate(args);
4760 complete(args->done);
4761
4762 current_restore_flags_nested(&pflags, PF_FSTRANS);
4763}
4764
4765/*
4766 * Some allocation requests often come in with little stack to work on. Push
4767 * them off to a worker thread so there is lots of stack to use. Otherwise just
4768 * call directly to avoid the context switch overhead here.
4769 */
4770int
4771xfs_bmapi_allocate(
4772 struct xfs_bmalloca *args)
4773{
4774 DECLARE_COMPLETION_ONSTACK(done);
4775
4776 if (!args->stack_switch)
4777 return __xfs_bmapi_allocate(args);
4778
4779
4780 args->done = &done;
4781 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
4782 queue_work(xfs_alloc_wq, &args->work);
4783 wait_for_completion(&done);
4784 return args->result;
4785}
4786
4737STATIC int 4787STATIC int
4738xfs_bmapi_convert_unwritten( 4788xfs_bmapi_convert_unwritten(
4739 struct xfs_bmalloca *bma, 4789 struct xfs_bmalloca *bma,
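/*
 * A minimal user-space sketch of the stack-switch pattern added above,
 * using POSIX threads in place of the kernel workqueue and completion.
 * The names (alloc_args, do_alloc) are illustrative, not kernel API.
 */
#include <pthread.h>
#include <stdio.h>

struct alloc_args {
	int	stack_switch;		/* caller is deep on the stack */
	int	input;
	int	result;
};

/* the "real" allocation work; gets a fresh stack when offloaded */
static int
do_alloc(struct alloc_args *args)
{
	return args->input * 2;		/* stand-in for __xfs_bmapi_allocate() */
}

static void *
alloc_worker(void *arg)
{
	struct alloc_args *args = arg;

	args->result = do_alloc(args);
	return NULL;			/* pthread_join() is the completion */
}

static int
alloc(struct alloc_args *args)
{
	pthread_t tid;

	if (!args->stack_switch)
		return do_alloc(args);	/* avoid context switch overhead */

	pthread_create(&tid, NULL, alloc_worker, args);
	pthread_join(&tid, NULL);	/* wait_for_completion() analogue */
	return args->result;
}

int
main(void)
{
	struct alloc_args args = { .stack_switch = 1, .input = 21 };

	printf("result = %d\n", alloc(&args));
	return 0;
}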
@@ -4919,6 +4969,7 @@ xfs_bmapi_write(
4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4969 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4920 bma.wasdel = wasdelay; 4970 bma.wasdel = wasdelay;
4921 bma.offset = bno; 4971 bma.offset = bno;
4972 bma.flags = flags;
4922 4973
4923 /* 4974 /*
4924 * There's a 32/64 bit type mismatch between the 4975 * There's a 32/64 bit type mismatch between the
@@ -4934,7 +4985,7 @@ xfs_bmapi_write(
4934 4985
4935 ASSERT(len > 0); 4986 ASSERT(len > 0);
4936 ASSERT(bma.length > 0); 4987 ASSERT(bma.length > 0);
4937 error = xfs_bmapi_allocate(&bma, flags); 4988 error = xfs_bmapi_allocate(&bma);
4938 if (error) 4989 if (error)
4939 goto error0; 4990 goto error0;
4940 if (bma.blkno == NULLFSBLOCK) 4991 if (bma.blkno == NULLFSBLOCK)
@@ -5554,7 +5605,7 @@ xfs_getbmap(
5554 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5605 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5555 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5606 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5556 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 5607 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5557 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5608 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5558 if (error) 5609 if (error)
5559 goto out_unlock_iolock; 5610 goto out_unlock_iolock;
5560 } 5611 }
@@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
5823 */ 5874 */
5824 while (level-- > 0) { 5875 while (level-- > 0) {
5825 /* See if buf is in cur first */ 5876 /* See if buf is in cur first */
5877 bp_release = 0;
5826 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5878 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5827 if (bp) { 5879 if (!bp) {
5828 bp_release = 0;
5829 } else {
5830 bp_release = 1; 5880 bp_release = 1;
5881 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5882 XFS_BMAP_BTREE_REF,
5883 &xfs_bmbt_buf_ops);
5884 if (error)
5885 goto error_norelse;
5831 } 5886 }
5832 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5833 XFS_BMAP_BTREE_REF)))
5834 goto error_norelse;
5835 block = XFS_BUF_TO_BLOCK(bp); 5887 block = XFS_BUF_TO_BLOCK(bp);
5836 XFS_WANT_CORRUPTED_GOTO( 5888 XFS_WANT_CORRUPTED_GOTO(
5837 xfs_bmap_sanity_check(mp, bp, level), 5889 xfs_bmap_sanity_check(mp, bp, level),
@@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
5908 if (bno == NULLFSBLOCK) 5960 if (bno == NULLFSBLOCK)
5909 break; 5961 break;
5910 5962
5963 bp_release = 0;
5911 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5964 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5912 if (bp) { 5965 if (!bp) {
5913 bp_release = 0;
5914 } else {
5915 bp_release = 1; 5966 bp_release = 1;
5967 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5968 XFS_BMAP_BTREE_REF,
5969 &xfs_bmbt_buf_ops);
5970 if (error)
5971 goto error_norelse;
5916 } 5972 }
5917 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5918 XFS_BMAP_BTREE_REF)))
5919 goto error_norelse;
5920 block = XFS_BUF_TO_BLOCK(bp); 5973 block = XFS_BUF_TO_BLOCK(bp);
5921 } 5974 }
5922 if (bp_release) { 5975 if (bp_release) {
@@ -6007,7 +6060,9 @@ xfs_bmap_count_tree(
6007 struct xfs_btree_block *block, *nextblock; 6060 struct xfs_btree_block *block, *nextblock;
6008 int numrecs; 6061 int numrecs;
6009 6062
6010 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6063 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
6064 &xfs_bmbt_buf_ops);
6065 if (error)
6011 return error; 6066 return error;
6012 *count += 1; 6067 *count += 1;
6013 block = XFS_BUF_TO_BLOCK(bp); 6068 block = XFS_BUF_TO_BLOCK(bp);
@@ -6016,8 +6071,10 @@ xfs_bmap_count_tree(
6016 /* Not at node above leaves, count this level of nodes */ 6071 /* Not at node above leaves, count this level of nodes */
6017 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6072 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6018 while (nextbno != NULLFSBLOCK) { 6073 while (nextbno != NULLFSBLOCK) {
6019 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6074 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
6020 0, &nbp, XFS_BMAP_BTREE_REF))) 6075 XFS_BMAP_BTREE_REF,
6076 &xfs_bmbt_buf_ops);
6077 if (error)
6021 return error; 6078 return error;
6022 *count += 1; 6079 *count += 1;
6023 nextblock = XFS_BUF_TO_BLOCK(nbp); 6080 nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6046,8 +6103,10 @@ xfs_bmap_count_tree(
6046 if (nextbno == NULLFSBLOCK) 6103 if (nextbno == NULLFSBLOCK)
6047 break; 6104 break;
6048 bno = nextbno; 6105 bno = nextbno;
6049 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 6106 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6050 XFS_BMAP_BTREE_REF))) 6107 XFS_BMAP_BTREE_REF,
6108 &xfs_bmbt_buf_ops);
6109 if (error)
6051 return error; 6110 return error;
6052 *count += 1; 6111 *count += 1;
6053 block = XFS_BUF_TO_BLOCK(bp); 6112 block = XFS_BUF_TO_BLOCK(bp);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce16..5f469c3516eb 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef struct xfs_bmap_free
77 * from written to unwritten, otherwise convert from unwritten to written. 77 * from written to unwritten, otherwise convert from unwritten to written.
78 */ 78 */
79#define XFS_BMAPI_CONVERT 0x040 79#define XFS_BMAPI_CONVERT 0x040
80#define XFS_BMAPI_STACK_SWITCH 0x080
80 81
81#define XFS_BMAPI_FLAGS \ 82#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 83 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef struct xfs_bmap_free
85 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 86 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
86 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 87 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
87 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 88 { XFS_BMAPI_CONTIG, "CONTIG" }, \
88 { XFS_BMAPI_CONVERT, "CONVERT" } 89 { XFS_BMAPI_CONVERT, "CONVERT" }, \
90 { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
89 91
90 92
91static inline int xfs_bmapi_aflag(int w) 93static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,11 @@ typedef struct xfs_bmalloca {
133 char userdata;/* set if is user data */ 135 char userdata;/* set if is user data */
134 char aeof; /* allocated space at eof */ 136 char aeof; /* allocated space at eof */
135 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138 char stack_switch;
139 int flags;
140 struct completion *done;
141 struct work_struct work;
142 int result;
136} xfs_bmalloca_t; 143} xfs_bmalloca_t;
137 144
138/* 145/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
36#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_quota.h" 38#include "xfs_quota.h"
39#include "xfs_trace.h"
39 40
40/* 41/*
41 * Determine the extent state. 42 * Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
707 cur->bc_rec.b.br_startoff; 708 cur->bc_rec.b.br_startoff;
708} 709}
709 710
711static void
712xfs_bmbt_verify(
713 struct xfs_buf *bp)
714{
715 struct xfs_mount *mp = bp->b_target->bt_mount;
716 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
717 unsigned int level;
718 int lblock_ok; /* block passes checks */
719
720 /* magic number and level verification.
721 *
 722 * We don't know what fork we belong to, so just verify that the level
723 * is less than the maximum of the two. Later checks will be more
724 * precise.
725 */
726 level = be16_to_cpu(block->bb_level);
727 lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
728 level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
729
730 /* numrecs verification */
731 lblock_ok = lblock_ok &&
732 be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
733
734 /* sibling pointer verification */
735 lblock_ok = lblock_ok &&
736 block->bb_u.l.bb_leftsib &&
737 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
738 XFS_FSB_SANITY_CHECK(mp,
739 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
740 block->bb_u.l.bb_rightsib &&
741 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
742 XFS_FSB_SANITY_CHECK(mp,
743 be64_to_cpu(block->bb_u.l.bb_rightsib)));
744
745 if (!lblock_ok) {
746 trace_xfs_btree_corrupt(bp, _RET_IP_);
747 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
748 xfs_buf_ioerror(bp, EFSCORRUPTED);
749 }
750}
751
752static void
753xfs_bmbt_read_verify(
754 struct xfs_buf *bp)
755{
756 xfs_bmbt_verify(bp);
757}
758
759static void
760xfs_bmbt_write_verify(
761 struct xfs_buf *bp)
762{
763 xfs_bmbt_verify(bp);
764}
765
766const struct xfs_buf_ops xfs_bmbt_buf_ops = {
767 .verify_read = xfs_bmbt_read_verify,
768 .verify_write = xfs_bmbt_write_verify,
769};
770
771
710#ifdef DEBUG 772#ifdef DEBUG
711STATIC int 773STATIC int
712xfs_bmbt_keys_inorder( 774xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
746 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, 808 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
747 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, 809 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
748 .key_diff = xfs_bmbt_key_diff, 810 .key_diff = xfs_bmbt_key_diff,
811 .buf_ops = &xfs_bmbt_buf_ops,
749#ifdef DEBUG 812#ifdef DEBUG
750 .keys_inorder = xfs_bmbt_keys_inorder, 813 .keys_inorder = xfs_bmbt_keys_inorder,
751 .recs_inorder = xfs_bmbt_recs_inorder, 814 .recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
237 struct xfs_trans *, struct xfs_inode *, int); 237 struct xfs_trans *, struct xfs_inode *, int);
238 238
239extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
239 240
240#endif /* __XFS_BMAP_BTREE_H__ */ 241#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
266 for (i = 0; i < new->bc_nlevels; i++) { 266 for (i = 0; i < new->bc_nlevels; i++) {
267 new->bc_ptrs[i] = cur->bc_ptrs[i]; 267 new->bc_ptrs[i] = cur->bc_ptrs[i];
268 new->bc_ra[i] = cur->bc_ra[i]; 268 new->bc_ra[i] = cur->bc_ra[i];
269 if ((bp = cur->bc_bufs[i])) { 269 bp = cur->bc_bufs[i];
270 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 270 if (bp) {
271 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { 271 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
272 XFS_BUF_ADDR(bp), mp->m_bsize,
273 0, &bp,
274 cur->bc_ops->buf_ops);
275 if (error) {
272 xfs_btree_del_cursor(new, error); 276 xfs_btree_del_cursor(new, error);
273 *ncur = NULL; 277 *ncur = NULL;
274 return error; 278 return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
609 * Get a buffer for the block, return it read in. 613 * Get a buffer for the block, return it read in.
610 * Long-form addressing. 614 * Long-form addressing.
611 */ 615 */
612int /* error */ 616int
613xfs_btree_read_bufl( 617xfs_btree_read_bufl(
614 xfs_mount_t *mp, /* file system mount point */ 618 struct xfs_mount *mp, /* file system mount point */
615 xfs_trans_t *tp, /* transaction pointer */ 619 struct xfs_trans *tp, /* transaction pointer */
616 xfs_fsblock_t fsbno, /* file system block number */ 620 xfs_fsblock_t fsbno, /* file system block number */
617 uint lock, /* lock flags for read_buf */ 621 uint lock, /* lock flags for read_buf */
618 xfs_buf_t **bpp, /* buffer for fsbno */ 622 struct xfs_buf **bpp, /* buffer for fsbno */
619 int refval) /* ref count value for buffer */ 623 int refval, /* ref count value for buffer */
620{ 624 const struct xfs_buf_ops *ops)
621 xfs_buf_t *bp; /* return value */ 625{
626 struct xfs_buf *bp; /* return value */
622 xfs_daddr_t d; /* real disk block address */ 627 xfs_daddr_t d; /* real disk block address */
623 int error; 628 int error;
624 629
625 ASSERT(fsbno != NULLFSBLOCK); 630 ASSERT(fsbno != NULLFSBLOCK);
626 d = XFS_FSB_TO_DADDR(mp, fsbno); 631 d = XFS_FSB_TO_DADDR(mp, fsbno);
627 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 632 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
628 mp->m_bsize, lock, &bp))) { 633 mp->m_bsize, lock, &bp, ops);
634 if (error)
629 return error; 635 return error;
630 }
631 ASSERT(!xfs_buf_geterror(bp)); 636 ASSERT(!xfs_buf_geterror(bp));
632 if (bp) 637 if (bp)
633 xfs_buf_set_ref(bp, refval); 638 xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
642/* ARGSUSED */ 647/* ARGSUSED */
643void 648void
644xfs_btree_reada_bufl( 649xfs_btree_reada_bufl(
645 xfs_mount_t *mp, /* file system mount point */ 650 struct xfs_mount *mp, /* file system mount point */
646 xfs_fsblock_t fsbno, /* file system block number */ 651 xfs_fsblock_t fsbno, /* file system block number */
647 xfs_extlen_t count) /* count of filesystem blocks */ 652 xfs_extlen_t count, /* count of filesystem blocks */
653 const struct xfs_buf_ops *ops)
648{ 654{
649 xfs_daddr_t d; 655 xfs_daddr_t d;
650 656
651 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
652 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
653 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
654} 660}
655 661
656/* 662/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
660/* ARGSUSED */ 666/* ARGSUSED */
661void 667void
662xfs_btree_reada_bufs( 668xfs_btree_reada_bufs(
663 xfs_mount_t *mp, /* file system mount point */ 669 struct xfs_mount *mp, /* file system mount point */
664 xfs_agnumber_t agno, /* allocation group number */ 670 xfs_agnumber_t agno, /* allocation group number */
665 xfs_agblock_t agbno, /* allocation group block number */ 671 xfs_agblock_t agbno, /* allocation group block number */
666 xfs_extlen_t count) /* count of filesystem blocks */ 672 xfs_extlen_t count, /* count of filesystem blocks */
673 const struct xfs_buf_ops *ops)
667{ 674{
668 xfs_daddr_t d; 675 xfs_daddr_t d;
669 676
670 ASSERT(agno != NULLAGNUMBER); 677 ASSERT(agno != NULLAGNUMBER);
671 ASSERT(agbno != NULLAGBLOCK); 678 ASSERT(agbno != NULLAGBLOCK);
672 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 679 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
673 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 680 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
674} 681}
675 682
676STATIC int 683STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
684 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 691 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
685 692
686 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 693 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
687 xfs_btree_reada_bufl(cur->bc_mp, left, 1); 694 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
695 cur->bc_ops->buf_ops);
688 rval++; 696 rval++;
689 } 697 }
690 698
691 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 699 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
692 xfs_btree_reada_bufl(cur->bc_mp, right, 1); 700 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
701 cur->bc_ops->buf_ops);
693 rval++; 702 rval++;
694 } 703 }
695 704
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
709 718
710 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 719 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
711 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 720 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
712 left, 1); 721 left, 1, cur->bc_ops->buf_ops);
713 rval++; 722 rval++;
714 } 723 }
715 724
716 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 725 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
717 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 726 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
718 right, 1); 727 right, 1, cur->bc_ops->buf_ops);
719 rval++; 728 rval++;
720 } 729 }
721 730
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
853 } 862 }
854} 863}
855 864
856STATIC void 865void
857xfs_btree_init_block( 866xfs_btree_init_block(
858 struct xfs_btree_cur *cur, 867 struct xfs_mount *mp,
859 int level, 868 struct xfs_buf *bp,
860 int numrecs, 869 __u32 magic,
861 struct xfs_btree_block *new) /* new block */ 870 __u16 level,
871 __u16 numrecs,
872 unsigned int flags)
862{ 873{
863 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 874 struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
875
876 new->bb_magic = cpu_to_be32(magic);
864 new->bb_level = cpu_to_be16(level); 877 new->bb_level = cpu_to_be16(level);
865 new->bb_numrecs = cpu_to_be16(numrecs); 878 new->bb_numrecs = cpu_to_be16(numrecs);
866 879
867 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 880 if (flags & XFS_BTREE_LONG_PTRS) {
868 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 881 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
869 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 882 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
870 } else { 883 } else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
873 } 886 }
874} 887}
875 888
889STATIC void
890xfs_btree_init_block_cur(
891 struct xfs_btree_cur *cur,
892 int level,
893 int numrecs,
894 struct xfs_buf *bp)
895{
896 xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
897 level, numrecs, cur->bc_flags);
898}
899
876/* 900/*
877 * Return true if ptr is the last record in the btree and 901 * Return true if ptr is the last record in the btree and
 878 * we need to track updates to this record. The decision 902 * we need to track updates to this record. The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
972 if (!*bpp) 996 if (!*bpp)
973 return ENOMEM; 997 return ENOMEM;
974 998
999 (*bpp)->b_ops = cur->bc_ops->buf_ops;
975 *block = XFS_BUF_TO_BLOCK(*bpp); 1000 *block = XFS_BUF_TO_BLOCK(*bpp);
976 return 0; 1001 return 0;
977} 1002}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
998 1023
999 d = xfs_btree_ptr_to_daddr(cur, ptr); 1024 d = xfs_btree_ptr_to_daddr(cur, ptr);
1000 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1025 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1001 mp->m_bsize, flags, bpp); 1026 mp->m_bsize, flags, bpp,
1027 cur->bc_ops->buf_ops);
1002 if (error) 1028 if (error)
1003 return error; 1029 return error;
1004 1030
1005 ASSERT(!xfs_buf_geterror(*bpp)); 1031 ASSERT(!xfs_buf_geterror(*bpp));
1006
1007 xfs_btree_set_refs(cur, *bpp); 1032 xfs_btree_set_refs(cur, *bpp);
1008 *block = XFS_BUF_TO_BLOCK(*bpp); 1033 *block = XFS_BUF_TO_BLOCK(*bpp);
1009 1034 return 0;
1010 error = xfs_btree_check_block(cur, *block, level, *bpp);
1011 if (error)
1012 xfs_trans_brelse(cur->bc_tp, *bpp);
1013 return error;
1014} 1035}
1015 1036
1016/* 1037/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
2183 goto error0; 2204 goto error0;
2184 2205
2185 /* Fill in the btree header for the new right block. */ 2206 /* Fill in the btree header for the new right block. */
2186 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); 2207 xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
2187 2208
2188 /* 2209 /*
2189 * Split the entries between the old and the new block evenly. 2210 * Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
2492 nptr = 2; 2513 nptr = 2;
2493 } 2514 }
2494 /* Fill in the new block's btree header and log it. */ 2515 /* Fill in the new block's btree header and log it. */
2495 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); 2516 xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
2496 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 2517 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2497 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 2518 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2498 !xfs_btree_ptr_is_null(cur, &rptr)); 2519 !xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
188 __int64_t (*key_diff)(struct xfs_btree_cur *cur, 188 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
189 union xfs_btree_key *key); 189 union xfs_btree_key *key);
190 190
191 const struct xfs_buf_ops *buf_ops;
192
191#ifdef DEBUG 193#ifdef DEBUG
192 /* check that k1 is lower than k2 */ 194 /* check that k1 is lower than k2 */
193 int (*keys_inorder)(struct xfs_btree_cur *cur, 195 int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
355 xfs_fsblock_t fsbno, /* file system block number */ 357 xfs_fsblock_t fsbno, /* file system block number */
356 uint lock, /* lock flags for read_buf */ 358 uint lock, /* lock flags for read_buf */
357 struct xfs_buf **bpp, /* buffer for fsbno */ 359 struct xfs_buf **bpp, /* buffer for fsbno */
358 int refval);/* ref count value for buffer */ 360 int refval, /* ref count value for buffer */
361 const struct xfs_buf_ops *ops);
359 362
360/* 363/*
361 * Read-ahead the block, don't wait for it, don't return a buffer. 364 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
365xfs_btree_reada_bufl( 368xfs_btree_reada_bufl(
366 struct xfs_mount *mp, /* file system mount point */ 369 struct xfs_mount *mp, /* file system mount point */
367 xfs_fsblock_t fsbno, /* file system block number */ 370 xfs_fsblock_t fsbno, /* file system block number */
368 xfs_extlen_t count); /* count of filesystem blocks */ 371 xfs_extlen_t count, /* count of filesystem blocks */
372 const struct xfs_buf_ops *ops);
369 373
370/* 374/*
371 * Read-ahead the block, don't wait for it, don't return a buffer. 375 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
376 struct xfs_mount *mp, /* file system mount point */ 380 struct xfs_mount *mp, /* file system mount point */
377 xfs_agnumber_t agno, /* allocation group number */ 381 xfs_agnumber_t agno, /* allocation group number */
378 xfs_agblock_t agbno, /* allocation group block number */ 382 xfs_agblock_t agbno, /* allocation group block number */
379 xfs_extlen_t count); /* count of filesystem blocks */ 383 xfs_extlen_t count, /* count of filesystem blocks */
384 const struct xfs_buf_ops *ops);
380 385
386/*
387 * Initialise a new btree block header
388 */
389void
390xfs_btree_init_block(
391 struct xfs_mount *mp,
392 struct xfs_buf *bp,
393 __u32 magic,
394 __u16 level,
395 __u16 numrecs,
396 unsigned int flags);
381 397
382/* 398/*
383 * Common btree core entry points. 399 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 933b7930b863..26673a0b20e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -569,7 +569,9 @@ found:
569 */ 569 */
570 if (bp->b_flags & XBF_STALE) { 570 if (bp->b_flags & XBF_STALE) {
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572 ASSERT(bp->b_iodone == NULL);
572 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 573 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
574 bp->b_ops = NULL;
573 } 575 }
574 576
575 trace_xfs_buf_find(bp, flags, _RET_IP_); 577 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -654,7 +656,8 @@ xfs_buf_read_map(
654 struct xfs_buftarg *target, 656 struct xfs_buftarg *target,
655 struct xfs_buf_map *map, 657 struct xfs_buf_map *map,
656 int nmaps, 658 int nmaps,
657 xfs_buf_flags_t flags) 659 xfs_buf_flags_t flags,
660 const struct xfs_buf_ops *ops)
658{ 661{
659 struct xfs_buf *bp; 662 struct xfs_buf *bp;
660 663
@@ -666,6 +669,7 @@ xfs_buf_read_map(
666 669
667 if (!XFS_BUF_ISDONE(bp)) { 670 if (!XFS_BUF_ISDONE(bp)) {
668 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
672 bp->b_ops = ops;
669 _xfs_buf_read(bp, flags); 673 _xfs_buf_read(bp, flags);
670 } else if (flags & XBF_ASYNC) { 674 } else if (flags & XBF_ASYNC) {
671 /* 675 /*
@@ -691,13 +695,14 @@ void
691xfs_buf_readahead_map( 695xfs_buf_readahead_map(
692 struct xfs_buftarg *target, 696 struct xfs_buftarg *target,
693 struct xfs_buf_map *map, 697 struct xfs_buf_map *map,
694 int nmaps) 698 int nmaps,
699 const struct xfs_buf_ops *ops)
695{ 700{
696 if (bdi_read_congested(target->bt_bdi)) 701 if (bdi_read_congested(target->bt_bdi))
697 return; 702 return;
698 703
699 xfs_buf_read_map(target, map, nmaps, 704 xfs_buf_read_map(target, map, nmaps,
700 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 705 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
701} 706}
702 707
703/* 708/*
@@ -709,10 +714,10 @@ xfs_buf_read_uncached(
709 struct xfs_buftarg *target, 714 struct xfs_buftarg *target,
710 xfs_daddr_t daddr, 715 xfs_daddr_t daddr,
711 size_t numblks, 716 size_t numblks,
712 int flags) 717 int flags,
718 const struct xfs_buf_ops *ops)
713{ 719{
714 xfs_buf_t *bp; 720 struct xfs_buf *bp;
715 int error;
716 721
717 bp = xfs_buf_get_uncached(target, numblks, flags); 722 bp = xfs_buf_get_uncached(target, numblks, flags);
718 if (!bp) 723 if (!bp)
@@ -723,13 +728,10 @@ xfs_buf_read_uncached(
723 bp->b_bn = daddr; 728 bp->b_bn = daddr;
724 bp->b_maps[0].bm_bn = daddr; 729 bp->b_maps[0].bm_bn = daddr;
725 bp->b_flags |= XBF_READ; 730 bp->b_flags |= XBF_READ;
731 bp->b_ops = ops;
726 732
727 xfsbdstrat(target->bt_mount, bp); 733 xfsbdstrat(target->bt_mount, bp);
728 error = xfs_buf_iowait(bp); 734 xfs_buf_iowait(bp);
729 if (error) {
730 xfs_buf_relse(bp);
731 return NULL;
732 }
733 return bp; 735 return bp;
734} 736}
735 737
@@ -999,27 +1001,37 @@ STATIC void
999xfs_buf_iodone_work( 1001xfs_buf_iodone_work(
1000 struct work_struct *work) 1002 struct work_struct *work)
1001{ 1003{
1002 xfs_buf_t *bp = 1004 struct xfs_buf *bp =
1003 container_of(work, xfs_buf_t, b_iodone_work); 1005 container_of(work, xfs_buf_t, b_iodone_work);
1006 bool read = !!(bp->b_flags & XBF_READ);
1007
1008 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1009 if (read && bp->b_ops)
1010 bp->b_ops->verify_read(bp);
1004 1011
1005 if (bp->b_iodone) 1012 if (bp->b_iodone)
1006 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
1007 else if (bp->b_flags & XBF_ASYNC) 1014 else if (bp->b_flags & XBF_ASYNC)
1008 xfs_buf_relse(bp); 1015 xfs_buf_relse(bp);
1016 else {
1017 ASSERT(read && bp->b_ops);
1018 complete(&bp->b_iowait);
1019 }
1009} 1020}
1010 1021
1011void 1022void
1012xfs_buf_ioend( 1023xfs_buf_ioend(
1013 xfs_buf_t *bp, 1024 struct xfs_buf *bp,
1014 int schedule) 1025 int schedule)
1015{ 1026{
1027 bool read = !!(bp->b_flags & XBF_READ);
1028
1016 trace_xfs_buf_iodone(bp, _RET_IP_); 1029 trace_xfs_buf_iodone(bp, _RET_IP_);
1017 1030
1018 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1019 if (bp->b_error == 0) 1031 if (bp->b_error == 0)
1020 bp->b_flags |= XBF_DONE; 1032 bp->b_flags |= XBF_DONE;
1021 1033
1022 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1034 if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
1023 if (schedule) { 1035 if (schedule) {
1024 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1036 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1025 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1037 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1039,7 @@ xfs_buf_ioend(
1027 xfs_buf_iodone_work(&bp->b_iodone_work); 1039 xfs_buf_iodone_work(&bp->b_iodone_work);
1028 } 1040 }
1029 } else { 1041 } else {
1042 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1030 complete(&bp->b_iowait); 1043 complete(&bp->b_iowait);
1031 } 1044 }
1032} 1045}
@@ -1197,9 +1210,14 @@ xfs_buf_bio_end_io(
1197{ 1210{
1198 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1211 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1199 1212
1200 xfs_buf_ioerror(bp, -error); 1213 /*
1214 * don't overwrite existing errors - otherwise we can lose errors on
1215 * buffers that require multiple bios to complete.
1216 */
1217 if (!bp->b_error)
1218 xfs_buf_ioerror(bp, -error);
1201 1219
1202 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1220 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1203 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1221 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1204 1222
1205 _xfs_buf_ioend(bp, 1); 1223 _xfs_buf_ioend(bp, 1);
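/*
 * A tiny sketch of the "don't overwrite existing errors" rule in the hunk
 * above: when one logical I/O is split across several bios, only the first
 * failure is recorded, so later successful bios cannot clear it.
 */
#include <stdio.h>

struct io {
	int error;			/* 0 until the first bio fails */
};

static void
bio_end(struct io *io, int bio_error)
{
	if (!io->error)
		io->error = bio_error;	/* first error wins */
}

int
main(void)
{
	struct io io = { 0 };

	bio_end(&io, 0);		/* bio 1 ok */
	bio_end(&io, -5);		/* bio 2 fails (EIO) */
	bio_end(&io, 0);		/* bio 3 ok: must not clear the error */
	printf("final error: %d\n", io.error);	/* prints -5 */
	return 0;
}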
@@ -1279,6 +1297,11 @@ next_chunk:
1279 if (size) 1297 if (size)
1280 goto next_chunk; 1298 goto next_chunk;
1281 } else { 1299 } else {
1300 /*
1301 * This is guaranteed not to be the last io reference count
1302 * because the caller (xfs_buf_iorequest) holds a count itself.
1303 */
1304 atomic_dec(&bp->b_io_remaining);
1282 xfs_buf_ioerror(bp, EIO); 1305 xfs_buf_ioerror(bp, EIO);
1283 bio_put(bio); 1306 bio_put(bio);
1284 } 1307 }
@@ -1304,6 +1327,20 @@ _xfs_buf_ioapply(
1304 rw |= REQ_FUA; 1327 rw |= REQ_FUA;
1305 if (bp->b_flags & XBF_FLUSH) 1328 if (bp->b_flags & XBF_FLUSH)
1306 rw |= REQ_FLUSH; 1329 rw |= REQ_FLUSH;
1330
1331 /*
1332 * Run the write verifier callback function if it exists. If
1333 * this function fails it will mark the buffer with an error and
1334 * the IO should not be dispatched.
1335 */
1336 if (bp->b_ops) {
1337 bp->b_ops->verify_write(bp);
1338 if (bp->b_error) {
1339 xfs_force_shutdown(bp->b_target->bt_mount,
1340 SHUTDOWN_CORRUPT_INCORE);
1341 return;
1342 }
1343 }
1307 } else if (bp->b_flags & XBF_READ_AHEAD) { 1344 } else if (bp->b_flags & XBF_READ_AHEAD) {
1308 rw = READA; 1345 rw = READA;
1309 } else { 1346 } else {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..23f5642480bb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
100struct xfs_buf; 100struct xfs_buf;
101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
102 102
103
103#define XB_PAGES 2 104#define XB_PAGES 2
104 105
105struct xfs_buf_map { 106struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
110#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ 111#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
111 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; 112 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
112 113
114struct xfs_buf_ops {
115 void (*verify_read)(struct xfs_buf *);
116 void (*verify_write)(struct xfs_buf *);
117};
118
113typedef struct xfs_buf { 119typedef struct xfs_buf {
114 /* 120 /*
115 * first cacheline holds all the fields needed for an uncontended cache 121 * first cacheline holds all the fields needed for an uncontended cache
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
153 unsigned int b_page_count; /* size of page array */ 159 unsigned int b_page_count; /* size of page array */
154 unsigned int b_offset; /* page offset in first page */ 160 unsigned int b_offset; /* page offset in first page */
155 unsigned short b_error; /* error code on I/O */ 161 unsigned short b_error; /* error code on I/O */
162 const struct xfs_buf_ops *b_ops;
156 163
157#ifdef XFS_BUF_LOCK_TRACKING 164#ifdef XFS_BUF_LOCK_TRACKING
158 int b_last_holder; 165 int b_last_holder;
159#endif 166#endif
160} xfs_buf_t; 167} xfs_buf_t;
161 168
162
163/* Finding and Reading Buffers */ 169/* Finding and Reading Buffers */
164struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, 170struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
165 struct xfs_buf_map *map, int nmaps, 171 struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
196 xfs_buf_flags_t flags); 202 xfs_buf_flags_t flags);
197struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, 203struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
198 struct xfs_buf_map *map, int nmaps, 204 struct xfs_buf_map *map, int nmaps,
199 xfs_buf_flags_t flags); 205 xfs_buf_flags_t flags,
206 const struct xfs_buf_ops *ops);
200void xfs_buf_readahead_map(struct xfs_buftarg *target, 207void xfs_buf_readahead_map(struct xfs_buftarg *target,
201 struct xfs_buf_map *map, int nmaps); 208 struct xfs_buf_map *map, int nmaps,
209 const struct xfs_buf_ops *ops);
202 210
203static inline struct xfs_buf * 211static inline struct xfs_buf *
204xfs_buf_get( 212xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
216 struct xfs_buftarg *target, 224 struct xfs_buftarg *target,
217 xfs_daddr_t blkno, 225 xfs_daddr_t blkno,
218 size_t numblks, 226 size_t numblks,
219 xfs_buf_flags_t flags) 227 xfs_buf_flags_t flags,
228 const struct xfs_buf_ops *ops)
220{ 229{
221 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 230 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
222 return xfs_buf_read_map(target, &map, 1, flags); 231 return xfs_buf_read_map(target, &map, 1, flags, ops);
223} 232}
224 233
225static inline void 234static inline void
226xfs_buf_readahead( 235xfs_buf_readahead(
227 struct xfs_buftarg *target, 236 struct xfs_buftarg *target,
228 xfs_daddr_t blkno, 237 xfs_daddr_t blkno,
229 size_t numblks) 238 size_t numblks,
239 const struct xfs_buf_ops *ops)
230{ 240{
231 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 241 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
232 return xfs_buf_readahead_map(target, &map, 1); 242 return xfs_buf_readahead_map(target, &map, 1, ops);
233} 243}
234 244
235struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); 245struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
239struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 249struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
240 int flags); 250 int flags);
241struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 251struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
242 xfs_daddr_t daddr, size_t numblks, int flags); 252 xfs_daddr_t daddr, size_t numblks, int flags,
253 const struct xfs_buf_ops *ops);
243void xfs_buf_hold(struct xfs_buf *bp); 254void xfs_buf_hold(struct xfs_buf *bp);
244 255
245/* Releasing Buffers */ 256/* Releasing Buffers */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a8d0ed911196..becf4a97efc6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -526,7 +526,25 @@ xfs_buf_item_unpin(
526 } 526 }
527 xfs_buf_relse(bp); 527 xfs_buf_relse(bp);
528 } else if (freed && remove) { 528 } else if (freed && remove) {
529 /*
530 * There are currently two references to the buffer - the active
531 * LRU reference and the buf log item. What we are about to do
532 * here - simulate a failed IO completion - requires 3
533 * references.
534 *
535 * The LRU reference is removed by the xfs_buf_stale() call. The
536 * buf item reference is removed by the xfs_buf_iodone()
537 * callback that is run by xfs_buf_do_callbacks() during ioend
538 * processing (via the bp->b_iodone callback), and then finally
539 * the ioend processing will drop the IO reference if the buffer
540 * is marked XBF_ASYNC.
541 *
542 * Hence we need to take an additional reference here so that IO
543 * completion processing doesn't free the buffer prematurely.
544 */
529 xfs_buf_lock(bp); 545 xfs_buf_lock(bp);
546 xfs_buf_hold(bp);
547 bp->b_flags |= XBF_ASYNC;
530 xfs_buf_ioerror(bp, EIO); 548 xfs_buf_ioerror(bp, EIO);
531 XFS_BUF_UNDONE(bp); 549 XFS_BUF_UNDONE(bp);
532 xfs_buf_stale(bp); 550 xfs_buf_stale(bp);
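/*
 * A sketch of the reference-count reasoning in the comment above:
 * simulating a failed I/O completion drops three references, so one
 * extra hold is needed first. Counts are plain ints, not kernel krefs.
 */
#include <assert.h>
#include <stdio.h>

struct buf {
	int refs;
};

static void
buf_rele(struct buf *b, const char *who)
{
	assert(b->refs > 0);		/* a premature free would trip this */
	printf("%s drops a ref -> %d\n", who, --b->refs);
}

int
main(void)
{
	struct buf b = { .refs = 2 };	/* LRU ref + buf log item ref */

	b.refs++;			/* xfs_buf_hold() analogue */
	buf_rele(&b, "stale/LRU");	/* xfs_buf_stale() */
	buf_rele(&b, "log item");	/* iodone callback */
	buf_rele(&b, "async ioend");	/* would underflow without the hold */
	return 0;
}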
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
1#ifndef _XFS_CKSUM_H
2#define _XFS_CKSUM_H 1
3
4#define XFS_CRC_SEED (~(__uint32_t)0)
5
6/*
7 * Calculate the intermediate checksum for a buffer that has the CRC field
 8 * inside it. The offset of the 32 bit crc field is passed as the
9 * cksum_offset parameter.
10 */
11static inline __uint32_t
12xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
13{
14 __uint32_t zero = 0;
15 __uint32_t crc;
16
17 /* Calculate CRC up to the checksum. */
18 crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
19
20 /* Skip checksum field */
21 crc = crc32c(crc, &zero, sizeof(__u32));
22
23 /* Calculate the rest of the CRC. */
24 return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
25 length - (cksum_offset + sizeof(__be32)));
26}
27
28/*
29 * Convert the intermediate checksum to the final ondisk format.
30 *
31 * The CRC32c calculation uses LE format even on BE machines, but returns the
32 * result in host endian format. Hence we need to byte swap it back to LE format
33 * so that it is consistent on disk.
34 */
35static inline __le32
36xfs_end_cksum(__uint32_t crc)
37{
38 return ~cpu_to_le32(crc);
39}
40
41/*
42 * Helper to generate the checksum for a buffer.
43 */
44static inline void
45xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
46{
47 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
48
49 *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
50}
51
52/*
53 * Helper to verify the checksum for a buffer.
54 */
55static inline int
56xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
57{
58 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
59
60 return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
61}
62
63#endif /* _XFS_CKSUM_H */
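/*
 * A self-contained sketch of the embedded-checksum scheme defined above.
 * A trivial rolling sum stands in for crc32c() so the example runs in
 * plain user space; the structure (checksum up to the field, fold in a
 * zeroed field, checksum the rest, then store/verify) matches the helpers.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
sum32(uint32_t seed, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--)
		seed = seed * 31 + *p++;	/* stand-in for crc32c() */
	return seed;
}

static uint32_t
start_cksum(const char *buffer, size_t length, size_t off)
{
	uint32_t zero = 0;
	uint32_t crc = sum32(~0U, buffer, off);		/* up to the field */

	crc = sum32(crc, &zero, sizeof(zero));		/* field as zero */
	return sum32(crc, buffer + off + sizeof(uint32_t),
		     length - (off + sizeof(uint32_t)));/* the rest */
}

int
main(void)
{
	char block[64] = "header";
	size_t off = 16;				/* checksum offset */
	uint32_t crc = start_cksum(block, sizeof(block), off);

	memcpy(block + off, &crc, sizeof(crc));		/* "update" */
	crc = start_cksum(block, sizeof(block), off);	/* "verify" */
	printf("%s\n", memcmp(block + off, &crc, sizeof(crc)) ?
	       "corrupt" : "ok");
	return 0;
}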
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
91 xfs_da_state_blk_t *save_blk); 91 xfs_da_state_blk_t *save_blk);
92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); 92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
93 93
94static void
95xfs_da_node_verify(
96 struct xfs_buf *bp)
97{
98 struct xfs_mount *mp = bp->b_target->bt_mount;
99 struct xfs_da_node_hdr *hdr = bp->b_addr;
100 int block_ok = 0;
101
102 block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
103 block_ok = block_ok &&
104 be16_to_cpu(hdr->level) > 0 &&
 105 be16_to_cpu(hdr->count) > 0;
106 if (!block_ok) {
107 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
108 xfs_buf_ioerror(bp, EFSCORRUPTED);
109 }
110
111}
112
113static void
114xfs_da_node_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_da_node_verify(bp);
118}
119
120/*
 121 * leaf/node format detection on trees is sketchy, so a node read can be done
 122 * on leaf level blocks when detection incorrectly identifies the tree as a
 123 * node format tree. In this case, we need to swap the verifier to match the
 124 * correct format of the block being read.
125 */
126static void
127xfs_da_node_read_verify(
128 struct xfs_buf *bp)
129{
130 struct xfs_mount *mp = bp->b_target->bt_mount;
131 struct xfs_da_blkinfo *info = bp->b_addr;
132
133 switch (be16_to_cpu(info->magic)) {
134 case XFS_DA_NODE_MAGIC:
135 xfs_da_node_verify(bp);
136 break;
137 case XFS_ATTR_LEAF_MAGIC:
138 bp->b_ops = &xfs_attr_leaf_buf_ops;
139 bp->b_ops->verify_read(bp);
140 return;
141 case XFS_DIR2_LEAFN_MAGIC:
142 bp->b_ops = &xfs_dir2_leafn_buf_ops;
143 bp->b_ops->verify_read(bp);
144 return;
145 default:
146 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
147 mp, info);
148 xfs_buf_ioerror(bp, EFSCORRUPTED);
149 break;
150 }
151}
152
153const struct xfs_buf_ops xfs_da_node_buf_ops = {
154 .verify_read = xfs_da_node_read_verify,
155 .verify_write = xfs_da_node_write_verify,
156};
157
158
159int
160xfs_da_node_read(
161 struct xfs_trans *tp,
162 struct xfs_inode *dp,
163 xfs_dablk_t bno,
164 xfs_daddr_t mappedbno,
165 struct xfs_buf **bpp,
166 int which_fork)
167{
168 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
169 which_fork, &xfs_da_node_buf_ops);
170}
171
94/*======================================================================== 172/*========================================================================
95 * Routines used for growing the Btree. 173 * Routines used for growing the Btree.
96 *========================================================================*/ 174 *========================================================================*/
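/*
 * A sketch of the read-time verifier swap implemented above: the generic
 * node verifier peeks at the on-disk magic and rebinds to the matching
 * block type's verifier. Names and structure here are illustrative; the
 * magic values mirror the kernel's but the rest is not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

enum { NODE_MAGIC = 0xfebe, ATTR_LEAF_MAGIC = 0xfbee, LEAFN_MAGIC = 0xd2ff };

struct blk {
	uint16_t	magic;
	int		error;
	void		(*verify)(struct blk *);
};

static void verify_node(struct blk *b)	     { printf("node %#x ok\n", b->magic); }
static void verify_attr_leaf(struct blk *b)  { printf("attr leaf %#x ok\n", b->magic); }
static void verify_leafn(struct blk *b)	     { printf("dir leafn %#x ok\n", b->magic); }

static void
node_read_verify(struct blk *b)
{
	switch (b->magic) {
	case NODE_MAGIC:
		verify_node(b);
		break;
	case ATTR_LEAF_MAGIC:
		b->verify = verify_attr_leaf;	/* rebind for later I/O */
		b->verify(b);
		break;
	case LEAFN_MAGIC:
		b->verify = verify_leafn;
		b->verify(b);
		break;
	default:
		b->error = 1;			/* EFSCORRUPTED analogue */
	}
}

int
main(void)
{
	struct blk b = { .magic = ATTR_LEAF_MAGIC };

	node_read_verify(&b);
	return 0;
}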
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
125 xfs_trans_log_buf(tp, bp, 203 xfs_trans_log_buf(tp, bp,
126 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); 204 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
127 205
206 bp->b_ops = &xfs_da_node_buf_ops;
128 *bpp = bp; 207 *bpp = bp;
129 return(0); 208 return(0);
130} 209}
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
324 } 403 }
325 memcpy(node, oldroot, size); 404 memcpy(node, oldroot, size);
326 xfs_trans_log_buf(tp, bp, 0, size - 1); 405 xfs_trans_log_buf(tp, bp, 0, size - 1);
406
407 bp->b_ops = blk1->bp->b_ops;
327 blk1->bp = bp; 408 blk1->bp = bp;
328 blk1->blkno = blkno; 409 blk1->blkno = blkno;
329 410
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
746 */ 827 */
747 child = be32_to_cpu(oldroot->btree[0].before); 828 child = be32_to_cpu(oldroot->btree[0].before);
748 ASSERT(child != 0); 829 ASSERT(child != 0);
749 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, 830 error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
750 args->whichfork); 831 args->whichfork);
751 if (error) 832 if (error)
752 return(error); 833 return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
754 xfs_da_blkinfo_onlychild_validate(bp->b_addr, 835 xfs_da_blkinfo_onlychild_validate(bp->b_addr,
755 be16_to_cpu(oldroot->hdr.level)); 836 be16_to_cpu(oldroot->hdr.level));
756 837
838 /*
839 * This could be copying a leaf back into the root block when only a
840 * single leaf block is left in the tree. Hence we also have to update
841 * the b_ops pointer here to match the buffer type change that could
842 * occur.
843 */
757 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); 844 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
845 root_blk->bp->b_ops = bp->b_ops;
758 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 846 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
759 error = xfs_da_shrink_inode(args, child, bp); 847 error = xfs_da_shrink_inode(args, child, bp);
760 return(error); 848 return(error);
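Both hunks above enforce the same invariant from different directions: whenever a block's contents are copied wholesale (root split, root join), b_ops must travel with the data, because the verifier encodes the block's format. A hedged sketch of that invariant with mocked types; buf_clone_contents is a hypothetical helper, not part of the patch:

#include <string.h>

struct buf_ops { int dummy; };

struct buf {
	const struct buf_ops *ops;
	char data[64];
};

/*
 * Copying a block's payload without carrying the ops over would leave
 * later writes verified against the wrong format, so the two must
 * always travel together.
 */
static void buf_clone_contents(struct buf *dst, const struct buf *src)
{
	memcpy(dst->data, src->data, sizeof(dst->data));
	dst->ops = src->ops;	/* the format tag follows the data */
}

int main(void)
{
	static const struct buf_ops leaf_ops;
	struct buf a = { .ops = &leaf_ops }, b = { 0 };

	buf_clone_contents(&b, &a);
	return b.ops != a.ops;	/* 0: invariant holds */
}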
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
779 xfs_dablk_t blkno; 867 xfs_dablk_t blkno;
780 struct xfs_buf *bp; 868 struct xfs_buf *bp;
781 869
870 trace_xfs_da_node_toosmall(state->args);
871
782 /* 872 /*
783 * Check for the degenerate case of the block being over 50% full. 873 * Check for the degenerate case of the block being over 50% full.
784 * If so, it's not worth even looking to see if we might be able 874 * If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
835 blkno = be32_to_cpu(info->back); 925 blkno = be32_to_cpu(info->back);
836 if (blkno == 0) 926 if (blkno == 0)
837 continue; 927 continue;
838 error = xfs_da_read_buf(state->args->trans, state->args->dp, 928 error = xfs_da_node_read(state->args->trans, state->args->dp,
839 blkno, -1, &bp, state->args->whichfork); 929 blkno, -1, &bp, state->args->whichfork);
840 if (error) 930 if (error)
841 return(error); 931 return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
900 xfs_dahash_t lasthash=0; 990 xfs_dahash_t lasthash=0;
901 int level, count; 991 int level, count;
902 992
993 trace_xfs_da_fixhashpath(state->args);
994
903 level = path->active-1; 995 level = path->active-1;
904 blk = &path->blk[ level ]; 996 blk = &path->blk[ level ];
905 switch (blk->magic) { 997 switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1079 * Read the next node down in the tree. 1171 * Read the next node down in the tree.
1080 */ 1172 */
1081 blk->blkno = blkno; 1173 blk->blkno = blkno;
1082 error = xfs_da_read_buf(args->trans, args->dp, blkno, 1174 error = xfs_da_node_read(args->trans, args->dp, blkno,
1083 -1, &blk->bp, args->whichfork); 1175 -1, &blk->bp, args->whichfork);
1084 if (error) { 1176 if (error) {
1085 blk->blkno = 0; 1177 blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1241 new_info->forw = cpu_to_be32(old_blk->blkno); 1333 new_info->forw = cpu_to_be32(old_blk->blkno);
1242 new_info->back = old_info->back; 1334 new_info->back = old_info->back;
1243 if (old_info->back) { 1335 if (old_info->back) {
1244 error = xfs_da_read_buf(args->trans, args->dp, 1336 error = xfs_da_node_read(args->trans, args->dp,
1245 be32_to_cpu(old_info->back), 1337 be32_to_cpu(old_info->back),
1246 -1, &bp, args->whichfork); 1338 -1, &bp, args->whichfork);
1247 if (error) 1339 if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1262 new_info->forw = old_info->forw; 1354 new_info->forw = old_info->forw;
1263 new_info->back = cpu_to_be32(old_blk->blkno); 1355 new_info->back = cpu_to_be32(old_blk->blkno);
1264 if (old_info->forw) { 1356 if (old_info->forw) {
1265 error = xfs_da_read_buf(args->trans, args->dp, 1357 error = xfs_da_node_read(args->trans, args->dp,
1266 be32_to_cpu(old_info->forw), 1358 be32_to_cpu(old_info->forw),
1267 -1, &bp, args->whichfork); 1359 -1, &bp, args->whichfork);
1268 if (error) 1360 if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1362 trace_xfs_da_unlink_back(args); 1454 trace_xfs_da_unlink_back(args);
1363 save_info->back = drop_info->back; 1455 save_info->back = drop_info->back;
1364 if (drop_info->back) { 1456 if (drop_info->back) {
1365 error = xfs_da_read_buf(args->trans, args->dp, 1457 error = xfs_da_node_read(args->trans, args->dp,
1366 be32_to_cpu(drop_info->back), 1458 be32_to_cpu(drop_info->back),
1367 -1, &bp, args->whichfork); 1459 -1, &bp, args->whichfork);
1368 if (error) 1460 if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1379 trace_xfs_da_unlink_forward(args); 1471 trace_xfs_da_unlink_forward(args);
1380 save_info->forw = drop_info->forw; 1472 save_info->forw = drop_info->forw;
1381 if (drop_info->forw) { 1473 if (drop_info->forw) {
1382 error = xfs_da_read_buf(args->trans, args->dp, 1474 error = xfs_da_node_read(args->trans, args->dp,
1383 be32_to_cpu(drop_info->forw), 1475 be32_to_cpu(drop_info->forw),
1384 -1, &bp, args->whichfork); 1476 -1, &bp, args->whichfork);
1385 if (error) 1477 if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1417 xfs_dablk_t blkno=0; 1509 xfs_dablk_t blkno=0;
1418 int level, error; 1510 int level, error;
1419 1511
1512 trace_xfs_da_path_shift(state->args);
1513
1420 /* 1514 /*
1421 * Roll up the Btree looking for the first block where our 1515 * Roll up the Btree looking for the first block where our
1422 * current index is not at the edge of the block. Note that 1516 * current index is not at the edge of the block. Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1463 * Read the next child block. 1557 * Read the next child block.
1464 */ 1558 */
1465 blk->blkno = blkno; 1559 blk->blkno = blkno;
1466 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, 1560 error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
1467 &blk->bp, args->whichfork); 1561 &blk->bp, args->whichfork);
1468 if (error) 1562 if (error)
1469 return(error); 1563 return(error);
1470 ASSERT(blk->bp != NULL); 1564 ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
1727 * Read the last block in the btree space. 1821 * Read the last block in the btree space.
1728 */ 1822 */
1729 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; 1823 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1730 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) 1824 error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
1825 if (error)
1731 return error; 1826 return error;
1732 /* 1827 /*
1733 * Copy the last block into the dead buffer and log it. 1828 * Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
1753 * If the moved block has a left sibling, fix up the pointers. 1848 * If the moved block has a left sibling, fix up the pointers.
1754 */ 1849 */
1755 if ((sib_blkno = be32_to_cpu(dead_info->back))) { 1850 if ((sib_blkno = be32_to_cpu(dead_info->back))) {
1756 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1851 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1852 if (error)
1757 goto done; 1853 goto done;
1758 sib_info = sib_buf->b_addr; 1854 sib_info = sib_buf->b_addr;
1759 if (unlikely( 1855 if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
1774 * If the moved block has a right sibling, fix up the pointers. 1870 * If the moved block has a right sibling, fix up the pointers.
1775 */ 1871 */
1776 if ((sib_blkno = be32_to_cpu(dead_info->forw))) { 1872 if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
1777 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1873 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1874 if (error)
1778 goto done; 1875 goto done;
1779 sib_info = sib_buf->b_addr; 1876 sib_info = sib_buf->b_addr;
1780 if (unlikely( 1877 if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
1797 * Walk down the tree looking for the parent of the moved block. 1894 * Walk down the tree looking for the parent of the moved block.
1798 */ 1895 */
1799 for (;;) { 1896 for (;;) {
1800 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1897 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1898 if (error)
1801 goto done; 1899 goto done;
1802 par_node = par_buf->b_addr; 1900 par_node = par_buf->b_addr;
1803 if (unlikely(par_node->hdr.info.magic != 1901 if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
1847 error = XFS_ERROR(EFSCORRUPTED); 1945 error = XFS_ERROR(EFSCORRUPTED);
1848 goto done; 1946 goto done;
1849 } 1947 }
1850 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1948 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1949 if (error)
1851 goto done; 1950 goto done;
1852 par_node = par_buf->b_addr; 1951 par_node = par_buf->b_addr;
1853 if (unlikely( 1952 if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
2133 xfs_dablk_t bno, 2232 xfs_dablk_t bno,
2134 xfs_daddr_t mappedbno, 2233 xfs_daddr_t mappedbno,
2135 struct xfs_buf **bpp, 2234 struct xfs_buf **bpp,
2136 int whichfork) 2235 int whichfork,
2236 const struct xfs_buf_ops *ops)
2137{ 2237{
2138 struct xfs_buf *bp; 2238 struct xfs_buf *bp;
2139 struct xfs_buf_map map; 2239 struct xfs_buf_map map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
2155 2255
2156 error = xfs_trans_read_buf_map(dp->i_mount, trans, 2256 error = xfs_trans_read_buf_map(dp->i_mount, trans,
2157 dp->i_mount->m_ddev_targp, 2257 dp->i_mount->m_ddev_targp,
2158 mapp, nmap, 0, &bp); 2258 mapp, nmap, 0, &bp, ops);
2159 if (error) 2259 if (error)
2160 goto out_free; 2260 goto out_free;
2161 2261
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
2211 struct xfs_trans *trans, 2311 struct xfs_trans *trans,
2212 struct xfs_inode *dp, 2312 struct xfs_inode *dp,
2213 xfs_dablk_t bno, 2313 xfs_dablk_t bno,
2214 int whichfork) 2314 xfs_daddr_t mappedbno,
2315 int whichfork,
2316 const struct xfs_buf_ops *ops)
2215{ 2317{
2216 xfs_daddr_t mappedbno = -1;
2217 struct xfs_buf_map map; 2318 struct xfs_buf_map map;
2218 struct xfs_buf_map *mapp; 2319 struct xfs_buf_map *mapp;
2219 int nmap; 2320 int nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
2221 2322
2222 mapp = &map; 2323 mapp = &map;
2223 nmap = 1; 2324 nmap = 1;
2224 error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, 2325 error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2225 &mapp, &nmap); 2326 &mapp, &nmap);
2226 if (error) { 2327 if (error) {
2227 /* mapping a hole is not an error, but we don't continue */ 2328 /* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
2231 } 2332 }
2232 2333
2233 mappedbno = mapp[0].bm_bn; 2334 mappedbno = mapp[0].bm_bn;
2234 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); 2335 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2235 2336
2236out_free: 2337out_free:
2237 if (mapp != &map) 2338 if (mapp != &map)
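With the ops argument threaded through xfs_da_read_buf() and xfs_da_reada_buf(), per-format helpers such as xfs_da_node_read() become thin wrappers that pin the right verifier so callers cannot forget it. A standalone sketch of that wrapper shape, all types and names mocked:

#include <stdio.h>

struct buf;
struct buf_ops { void (*verify_read)(struct buf *); };
struct buf { const struct buf_ops *ops; };

static void node_verify(struct buf *bp) { (void)bp; }
static const struct buf_ops node_ops = { .verify_read = node_verify };

/* Generic read: the caller must name the verifier up front. */
static int read_buf(long bno, struct buf **bpp, const struct buf_ops *ops)
{
	static struct buf b;

	b.ops = ops;	/* attached before I/O so completion can verify */
	*bpp = &b;
	(void)bno;
	return 0;
}

/* Per-format wrapper: one place to get the ops right. */
static int node_read(long bno, struct buf **bpp)
{
	return read_buf(bno, bpp, &node_ops);
}

int main(void)
{
	struct buf *bp;

	if (node_read(7, &bp) == 0)
		printf("ops attached: %d\n", bp->ops == &node_ops); /* 1 */
	return 0;
}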
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DA_BTREE_H__ 18#ifndef __XFS_DA_BTREE_H__
19#define __XFS_DA_BTREE_H__ 19#define __XFS_DA_BTREE_H__
20 20
21struct xfs_buf;
22struct xfs_bmap_free; 21struct xfs_bmap_free;
23struct xfs_inode; 22struct xfs_inode;
24struct xfs_mount; 23struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
214 */ 213 */
215int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, 214int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
216 xfs_da_state_blk_t *new_blk); 215 xfs_da_state_blk_t *new_blk);
216int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
217 xfs_dablk_t bno, xfs_daddr_t mappedbno,
218 struct xfs_buf **bpp, int which_fork);
217 219
218/* 220/*
219 * Utility routines. 221 * Utility routines.
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
226 struct xfs_buf **bp, int whichfork); 228 struct xfs_buf **bp, int whichfork);
227int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, 229int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
228 xfs_dablk_t bno, xfs_daddr_t mappedbno, 230 xfs_dablk_t bno, xfs_daddr_t mappedbno,
229 struct xfs_buf **bpp, int whichfork); 231 struct xfs_buf **bpp, int whichfork,
232 const struct xfs_buf_ops *ops);
230xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, 233xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
231 xfs_dablk_t bno, int whichfork); 234 xfs_dablk_t bno, xfs_daddr_t mapped_bno,
235 int whichfork, const struct xfs_buf_ops *ops);
232int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 236int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
233 struct xfs_buf *dead_buf); 237 struct xfs_buf *dead_buf);
234 238
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..d0e9c74d3d96 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
246 goto out_unlock; 246 goto out_unlock;
247 } 247 }
248 248
249 if (VN_CACHED(VFS_I(tip)) != 0) { 249 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
250 error = xfs_flushinval_pages(tip, 0, -1, 250 if (error)
251 FI_REMAPF_LOCKED); 251 goto out_unlock;
252 if (error) 252 truncate_pagecache_range(VFS_I(tip), 0, -1);
253 goto out_unlock;
254 }
255 253
256 /* Verify O_DIRECT for ftmp */ 254 /* Verify O_DIRECT for ftmp */
257 if (VN_CACHED(VFS_I(tip)) != 0) { 255 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
315 * are safe. We don't really care if non-io related 313 * are safe. We don't really care if non-io related
316 * fields change. 314 * fields change.
317 */ 315 */
318 316 truncate_pagecache_range(VFS_I(ip), 0, -1);
319 xfs_tosspages(ip, 0, -1, FI_REMAPF);
320 317
321 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); 318 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
322 if ((error = xfs_trans_reserve(tp, 0, 319 if ((error = xfs_trans_reserve(tp, 0,
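The leading minus on filemap_write_and_wait() above is deliberate: at this point in its history XFS uses positive errnos internally while core VFS helpers return negative ones, so VFS return values are negated at the boundary. A two-function sketch of the convention (the -5/-EIO value is illustrative):

#include <stdio.h>

static int vfs_helper(void) { return -5; }		/* VFS style: -EIO */
static int xfs_caller(void) { return -vfs_helper(); }	/* XFS style:  EIO */

int main(void)
{
	printf("%d\n", xfs_caller());	/* 5 */
	return 0;
}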
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..7536faaa61e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); 56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
57} 57}
58 58
59static void
60xfs_dir2_block_verify(
61 struct xfs_buf *bp)
62{
63 struct xfs_mount *mp = bp->b_target->bt_mount;
64 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
65 int block_ok = 0;
66
67 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
68 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
69
70 if (!block_ok) {
71 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
72 xfs_buf_ioerror(bp, EFSCORRUPTED);
73 }
74}
75
76static void
77xfs_dir2_block_read_verify(
78 struct xfs_buf *bp)
79{
80 xfs_dir2_block_verify(bp);
81}
82
83static void
84xfs_dir2_block_write_verify(
85 struct xfs_buf *bp)
86{
87 xfs_dir2_block_verify(bp);
88}
89
90const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
91 .verify_read = xfs_dir2_block_read_verify,
92 .verify_write = xfs_dir2_block_write_verify,
93};
94
95static int
96xfs_dir2_block_read(
97 struct xfs_trans *tp,
98 struct xfs_inode *dp,
99 struct xfs_buf **bpp)
100{
101 struct xfs_mount *mp = dp->i_mount;
102
103 return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
104 XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
105}
106
107static void
108xfs_dir2_block_need_space(
109 struct xfs_dir2_data_hdr *hdr,
110 struct xfs_dir2_block_tail *btp,
111 struct xfs_dir2_leaf_entry *blp,
112 __be16 **tagpp,
113 struct xfs_dir2_data_unused **dupp,
114 struct xfs_dir2_data_unused **enddupp,
115 int *compact,
116 int len)
117{
118 struct xfs_dir2_data_free *bf;
119 __be16 *tagp = NULL;
120 struct xfs_dir2_data_unused *dup = NULL;
121 struct xfs_dir2_data_unused *enddup = NULL;
122
123 *compact = 0;
124 bf = hdr->bestfree;
125
126 /*
127 * If there are stale entries we'll use one for the leaf.
128 */
129 if (btp->stale) {
130 if (be16_to_cpu(bf[0].length) >= len) {
131 /*
132 * The biggest entry is big enough to avoid compaction.
133 */
134 dup = (xfs_dir2_data_unused_t *)
135 ((char *)hdr + be16_to_cpu(bf[0].offset));
136 goto out;
137 }
138
139 /*
140 * Will need to compact to make this work.
141 * Tag just before the first leaf entry.
142 */
143 *compact = 1;
144 tagp = (__be16 *)blp - 1;
145
146 /* Data object just before the first leaf entry. */
147 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
148
149 /*
150 * If it's not free then the data will go where the
151 * leaf data starts now, if it works at all.
152 */
153 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
154 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
155 (uint)sizeof(*blp) < len)
156 dup = NULL;
157 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
158 dup = NULL;
159 else
160 dup = (xfs_dir2_data_unused_t *)blp;
161 goto out;
162 }
163
164 /*
164 * No stale entries, so just use free space.
166 * Tag just before the first leaf entry.
167 */
168 tagp = (__be16 *)blp - 1;
169
170 /* Data object just before the first leaf entry. */
171 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
172
173 /*
174 * If it's not free then can't do this add without cleaning up:
175 * the space before the first leaf entry needs to be free so it
176 * can be expanded to hold the pointer to the new entry.
177 */
178 if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
179 /*
180 * Check out the biggest freespace and see if it's the same one.
181 */
182 dup = (xfs_dir2_data_unused_t *)
183 ((char *)hdr + be16_to_cpu(bf[0].offset));
184 if (dup != enddup) {
185 /*
186 * Not the same free entry, just check its length.
187 */
188 if (be16_to_cpu(dup->length) < len)
189 dup = NULL;
190 goto out;
191 }
192
193 /*
194 * It is the biggest freespace, can it hold the leaf too?
195 */
196 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
197 /*
198 * No, so use the second-largest entry instead if it works.
199 */
200 if (be16_to_cpu(bf[1].length) >= len)
201 dup = (xfs_dir2_data_unused_t *)
202 ((char *)hdr + be16_to_cpu(bf[1].offset));
203 else
204 dup = NULL;
205 }
206 }
207out:
208 *tagpp = tagp;
209 *dupp = dup;
210 *enddupp = enddup;
211}
212
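The stale branch above decides whether compaction would yield enough room: squeezing the stale leaf entries reclaims (stale - 1) slots of sizeof(*blp) bytes each (one stale slot is always kept), and that reclaimed space is added to the candidate free region. A worked example of the check with made-up sizes:

#include <stdio.h>

int main(void)
{
	unsigned stale = 4;	/* stale leaf entries in the block */
	unsigned blp_size = 8;	/* sizeof(*blp): one leaf entry */
	unsigned dup_len = 16;	/* free region in front of the leaf */
	unsigned len = 32;	/* space the new data entry needs */

	/* Mirrors: dup->length + (stale - 1) * sizeof(*blp) < len */
	int fits = dup_len + (stale - 1) * blp_size >= len;

	printf("fits after compaction: %d\n", fits);	/* 16 + 24 >= 32 -> 1 */
	return 0;
}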
213/*
214 * Compact the leaf entries.
215 * Leave the highest-numbered stale entry stale.
216 * XXX should be the one closest to mid but mid is not yet computed.
217 */
218static void
219xfs_dir2_block_compact(
220 struct xfs_trans *tp,
221 struct xfs_buf *bp,
222 struct xfs_dir2_data_hdr *hdr,
223 struct xfs_dir2_block_tail *btp,
224 struct xfs_dir2_leaf_entry *blp,
225 int *needlog,
226 int *lfloghigh,
227 int *lfloglow)
228{
229 int fromidx; /* source leaf index */
230 int toidx; /* target leaf index */
231 int needscan = 0;
232 int highstale; /* high stale index */
233
234 fromidx = toidx = be32_to_cpu(btp->count) - 1;
235 highstale = *lfloghigh = -1;
236 for (; fromidx >= 0; fromidx--) {
237 if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
238 if (highstale == -1)
239 highstale = toidx;
240 else {
241 if (*lfloghigh == -1)
242 *lfloghigh = toidx;
243 continue;
244 }
245 }
246 if (fromidx < toidx)
247 blp[toidx] = blp[fromidx];
248 toidx--;
249 }
250 *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
251 *lfloghigh -= be32_to_cpu(btp->stale) - 1;
252 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
253 xfs_dir2_data_make_free(tp, bp,
254 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
255 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
256 needlog, &needscan);
257 blp += be32_to_cpu(btp->stale) - 1;
258 btp->stale = cpu_to_be32(1);
259 /*
260 * If we now need to rebuild the bestfree map, do so.
261 * This needs to happen before the next call to use_free.
262 */
263 if (needscan)
264 xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
265}
266
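The factored-out loop walks the leaf array from the top, keeps the first (and therefore highest-numbered) stale entry it meets, drops the rest, and slides live entries up. The same logic in a standalone demo, with 0 standing in for XFS_DIR2_NULL_DATAPTR:

#include <stdio.h>

int main(void)
{
	int blp[] = { 5, 0, 7, 0, 9, 0 };	/* 0 marks a stale entry */
	int n = 6, highstale = -1;
	int fromidx, toidx = n - 1;
	int i;

	for (fromidx = n - 1; fromidx >= 0; fromidx--) {
		if (blp[fromidx] == 0) {
			if (highstale == -1)
				highstale = toidx;	/* keep this one */
			else
				continue;		/* drop the rest */
		}
		if (fromidx < toidx)
			blp[toidx] = blp[fromidx];
		toidx--;
	}
	for (i = toidx + 1; i < n; i++)
		printf("%d ", blp[i]);	/* prints: 5 7 9 0 */
	printf("\n");
	return 0;
}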
59/* 267/*
60 * Add an entry to a block directory. 268 * Add an entry to a block directory.
61 */ 269 */
@@ -63,7 +271,6 @@ int /* error */
63xfs_dir2_block_addname( 271xfs_dir2_block_addname(
64 xfs_da_args_t *args) /* directory op arguments */ 272 xfs_da_args_t *args) /* directory op arguments */
65{ 273{
66 xfs_dir2_data_free_t *bf; /* bestfree table in block */
67 xfs_dir2_data_hdr_t *hdr; /* block header */ 274 xfs_dir2_data_hdr_t *hdr; /* block header */
68 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ 275 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
69 struct xfs_buf *bp; /* buffer for block */ 276 struct xfs_buf *bp; /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
94 dp = args->dp; 301 dp = args->dp;
95 tp = args->trans; 302 tp = args->trans;
96 mp = dp->i_mount; 303 mp = dp->i_mount;
97 /* 304
98 * Read the (one and only) directory block into dabuf bp. 305 /* Read the (one and only) directory block into bp. */
99 */ 306 error = xfs_dir2_block_read(tp, dp, &bp);
100 if ((error = 307 if (error)
101 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
102 return error; 308 return error;
103 } 309
104 ASSERT(bp != NULL);
105 hdr = bp->b_addr;
106 /*
107 * Check the magic number, corrupted if wrong.
108 */
109 if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
110 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
111 XFS_ERRLEVEL_LOW, mp, hdr);
112 xfs_trans_brelse(tp, bp);
113 return XFS_ERROR(EFSCORRUPTED);
114 }
115 len = xfs_dir2_data_entsize(args->namelen); 310 len = xfs_dir2_data_entsize(args->namelen);
311
116 /* 312 /*
117 * Set up pointers to parts of the block. 313 * Set up pointers to parts of the block.
118 */ 314 */
119 bf = hdr->bestfree; 315 hdr = bp->b_addr;
120 btp = xfs_dir2_block_tail_p(mp, hdr); 316 btp = xfs_dir2_block_tail_p(mp, hdr);
121 blp = xfs_dir2_block_leaf_p(btp); 317 blp = xfs_dir2_block_leaf_p(btp);
318
122 /* 319 /*
123 * No stale entries? Need space for entry and new leaf. 320 * Find out if we can reuse stale entries or whether we need extra
124 */ 321 * space for the entry and a new leaf.
125 if (!btp->stale) {
126 /*
127 * Tag just before the first leaf entry.
128 */
129 tagp = (__be16 *)blp - 1;
130 /*
131 * Data object just before the first leaf entry.
132 */
133 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
134 /*
135 * If it's not free then can't do this add without cleaning up:
136 * the space before the first leaf entry needs to be free so it
137 * can be expanded to hold the pointer to the new entry.
138 */
139 if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
140 dup = enddup = NULL;
141 /*
142 * Check out the biggest freespace and see if it's the same one.
143 */
144 else {
145 dup = (xfs_dir2_data_unused_t *)
146 ((char *)hdr + be16_to_cpu(bf[0].offset));
147 if (dup == enddup) {
148 /*
149 * It is the biggest freespace, is it too small
150 * to hold the new leaf too?
151 */
152 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
153 /*
154 * Yes, we use the second-largest
155 * entry instead if it works.
156 */
157 if (be16_to_cpu(bf[1].length) >= len)
158 dup = (xfs_dir2_data_unused_t *)
159 ((char *)hdr +
160 be16_to_cpu(bf[1].offset));
161 else
162 dup = NULL;
163 }
164 } else {
165 /*
166 * Not the same free entry,
167 * just check its length.
168 */
169 if (be16_to_cpu(dup->length) < len) {
170 dup = NULL;
171 }
172 }
173 }
174 compact = 0;
175 }
176 /*
177 * If there are stale entries we'll use one for the leaf.
178 * Is the biggest entry enough to avoid compaction?
179 */
180 else if (be16_to_cpu(bf[0].length) >= len) {
181 dup = (xfs_dir2_data_unused_t *)
182 ((char *)hdr + be16_to_cpu(bf[0].offset));
183 compact = 0;
184 }
185 /*
186 * Will need to compact to make this work.
187 */ 322 */
188 else { 323 xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
189 /* 324 &enddup, &compact, len);
190 * Tag just before the first leaf entry. 325
191 */
192 tagp = (__be16 *)blp - 1;
193 /*
194 * Data object just before the first leaf entry.
195 */
196 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
197 /*
198 * If it's not free then the data will go where the
199 * leaf data starts now, if it works at all.
200 */
201 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
202 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
203 (uint)sizeof(*blp) < len)
204 dup = NULL;
205 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
206 dup = NULL;
207 else
208 dup = (xfs_dir2_data_unused_t *)blp;
209 compact = 1;
210 }
211 /* 326 /*
212 * If this isn't a real add, we're done with the buffer. 327 * We now have everything we need for the space check.
213 */ 328 */
214 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 329 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
215 xfs_trans_brelse(tp, bp); 330 xfs_trans_brelse(tp, bp);
331 if (!dup)
332 return XFS_ERROR(ENOSPC);
333 return 0;
334 }
335
216 /* 336 /*
217 * If we don't have space for the new entry & leaf ... 337 * If we don't have space for the new entry & leaf ...
218 */ 338 */
219 if (!dup) { 339 if (!dup) {
220 /* 340 /* Don't have a space reservation: return no-space. */
221 * Not trying to actually do anything, or don't have 341 if (args->total == 0)
222 * a space reservation: return no-space.
223 */
224 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
225 return XFS_ERROR(ENOSPC); 342 return XFS_ERROR(ENOSPC);
226 /* 343 /*
227 * Convert to the next larger format. 344 * Convert to the next larger format.
@@ -232,65 +349,24 @@ xfs_dir2_block_addname(
232 return error; 349 return error;
233 return xfs_dir2_leaf_addname(args); 350 return xfs_dir2_leaf_addname(args);
234 } 351 }
235 /* 352
236 * Just checking, and it would work, so say so.
237 */
238 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
239 return 0;
240 needlog = needscan = 0; 353 needlog = needscan = 0;
354
241 /* 355 /*
242 * If need to compact the leaf entries, do it now. 356 * If need to compact the leaf entries, do it now.
243 * Leave the highest-numbered stale entry stale.
244 * XXX should be the one closest to mid but mid is not yet computed.
245 */
246 if (compact) {
247 int fromidx; /* source leaf index */
248 int toidx; /* target leaf index */
249
250 for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
251 highstale = lfloghigh = -1;
252 fromidx >= 0;
253 fromidx--) {
254 if (blp[fromidx].address ==
255 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
256 if (highstale == -1)
257 highstale = toidx;
258 else {
259 if (lfloghigh == -1)
260 lfloghigh = toidx;
261 continue;
262 }
263 }
264 if (fromidx < toidx)
265 blp[toidx] = blp[fromidx];
266 toidx--;
267 }
268 lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
269 lfloghigh -= be32_to_cpu(btp->stale) - 1;
270 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
271 xfs_dir2_data_make_free(tp, bp,
272 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
273 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
274 &needlog, &needscan);
275 blp += be32_to_cpu(btp->stale) - 1;
276 btp->stale = cpu_to_be32(1);
277 /*
278 * If we now need to rebuild the bestfree map, do so.
279 * This needs to happen before the next call to use_free.
280 */
281 if (needscan) {
282 xfs_dir2_data_freescan(mp, hdr, &needlog);
283 needscan = 0;
284 }
285 }
286 /*
287 * Set leaf logging boundaries to impossible state.
288 * For the no-stale case they're set explicitly.
289 */ 357 */
358 if (compact)
359 xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
360 &lfloghigh, &lfloglow);
290 else if (btp->stale) { 361 else if (btp->stale) {
362 /*
363 * Set leaf logging boundaries to impossible state.
364 * For the no-stale case they're set explicitly.
365 */
291 lfloglow = be32_to_cpu(btp->count); 366 lfloglow = be32_to_cpu(btp->count);
292 lfloghigh = -1; 367 lfloghigh = -1;
293 } 368 }
369
294 /* 370 /*
295 * Find the slot that's first lower than our hash value, -1 if none. 371 * Find the slot that's first lower than our hash value, -1 if none.
296 */ 372 */
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents(
450 /* 526 /*
451 * If the block number in the offset is out of range, we're done. 527 * If the block number in the offset is out of range, we're done.
452 */ 528 */
453 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { 529 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
454 return 0; 530 return 0;
455 } 531
456 /* 532 error = xfs_dir2_block_read(NULL, dp, &bp);
457 * Can't read the block, give up, else get dabuf in bp.
458 */
459 error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
460 &bp, XFS_DATA_FORK);
461 if (error) 533 if (error)
462 return error; 534 return error;
463 535
464 ASSERT(bp != NULL);
465 /* 536 /*
466 * Extract the byte offset we start at from the seek pointer. 537 * Extract the byte offset we start at from the seek pointer.
467 * We'll skip entries before this. 538 * We'll skip entries before this.
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int(
637 dp = args->dp; 708 dp = args->dp;
638 tp = args->trans; 709 tp = args->trans;
639 mp = dp->i_mount; 710 mp = dp->i_mount;
640 /* 711
641 * Read the buffer, return error if we can't get it. 712 error = xfs_dir2_block_read(tp, dp, &bp);
642 */ 713 if (error)
643 if ((error =
644 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
645 return error; 714 return error;
646 } 715
647 ASSERT(bp != NULL);
648 hdr = bp->b_addr; 716 hdr = bp->b_addr;
649 xfs_dir2_data_check(dp, bp); 717 xfs_dir2_data_check(dp, bp);
650 btp = xfs_dir2_block_tail_p(mp, hdr); 718 btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block(
917 /* 985 /*
918 * Read the data block if we don't already have it, give up if it fails. 986 * Read the data block if we don't already have it, give up if it fails.
919 */ 987 */
920 if (dbp == NULL && 988 if (!dbp) {
921 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, 989 error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
922 XFS_DATA_FORK))) { 990 if (error)
923 return error; 991 return error;
924 } 992 }
925 hdr = dbp->b_addr; 993 hdr = dbp->b_addr;
926 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 994 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block(
944 /* 1012 /*
945 * Start converting it to block form. 1013 * Start converting it to block form.
946 */ 1014 */
1015 dbp->b_ops = &xfs_dir2_block_buf_ops;
947 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1016 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
948 needlog = 1; 1017 needlog = 1;
949 needscan = 0; 1018 needscan = 0;
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block(
1073 kmem_free(sfp); 1142 kmem_free(sfp);
1074 return error; 1143 return error;
1075 } 1144 }
1145 bp->b_ops = &xfs_dir2_block_buf_ops;
1076 hdr = bp->b_addr; 1146 hdr = bp->b_addr;
1077 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1147 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
1078 /* 1148 /*
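Note the closing pattern of this file: every site that changes a buffer's format in place (leaf-to-block conversion, shortform-to-block) now flips b_ops together with the on-disk magic. A hedged sketch of that pairing; the types and conversion helper are mocked, though the magic value is the real XFS_DIR2_BLOCK_MAGIC ("XD2B"):

#include <stdio.h>
#include <stdint.h>

struct buf_ops { int dummy; };
static const struct buf_ops block_ops;

struct buf {
	const struct buf_ops *ops;
	uint32_t magic;		/* stands in for hdr->magic */
};

#define DIR2_BLOCK_MAGIC 0x58443242u	/* "XD2B" */

/* Format change = new magic on disk + new verifier in memory. */
static void convert_to_block_format(struct buf *bp)
{
	bp->ops = &block_ops;
	bp->magic = DIR2_BLOCK_MAGIC;
}

int main(void)
{
	struct buf bp = { 0 };

	convert_to_block_format(&bp);
	printf("magic = %#x, ops set = %d\n",
	       (unsigned)bp.magic, bp.ops == &block_ops);
	return 0;
}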
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
34STATIC xfs_dir2_data_free_t * 34STATIC xfs_dir2_data_free_t *
35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); 35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
36 36
37#ifdef DEBUG
38/* 37/*
39 * Check the consistency of the data block. 38 * Check the consistency of the data block.
40 * The input can also be a block-format directory. 39 * The input can also be a block-format directory.
41 * Pop an assert if we find anything bad. 40 * Return 0 if the buffer is good, otherwise an error.
42 */ 41 */
43void 42int
44xfs_dir2_data_check( 43__xfs_dir2_data_check(
45 struct xfs_inode *dp, /* incore inode pointer */ 44 struct xfs_inode *dp, /* incore inode pointer */
46 struct xfs_buf *bp) /* data block's buffer */ 45 struct xfs_buf *bp) /* data block's buffer */
47{ 46{
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
64 int stale; /* count of stale leaves */ 63 int stale; /* count of stale leaves */
65 struct xfs_name name; 64 struct xfs_name name;
66 65
67 mp = dp->i_mount; 66 mp = bp->b_target->bt_mount;
68 hdr = bp->b_addr; 67 hdr = bp->b_addr;
69 bf = hdr->bestfree; 68 bf = hdr->bestfree;
70 p = (char *)(hdr + 1); 69 p = (char *)(hdr + 1);
71 70
72 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 71 switch (hdr->magic) {
72 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
73 btp = xfs_dir2_block_tail_p(mp, hdr); 73 btp = xfs_dir2_block_tail_p(mp, hdr);
74 lep = xfs_dir2_block_leaf_p(btp); 74 lep = xfs_dir2_block_leaf_p(btp);
75 endp = (char *)lep; 75 endp = (char *)lep;
76 } else { 76 break;
77 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 77 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
78 endp = (char *)hdr + mp->m_dirblksize; 78 endp = (char *)hdr + mp->m_dirblksize;
79 break;
80 default:
81 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
82 return EFSCORRUPTED;
79 } 83 }
80 84
81 count = lastfree = freeseen = 0; 85 count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
83 * Account for zero bestfree entries. 87 * Account for zero bestfree entries.
84 */ 88 */
85 if (!bf[0].length) { 89 if (!bf[0].length) {
86 ASSERT(!bf[0].offset); 90 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
87 freeseen |= 1 << 0; 91 freeseen |= 1 << 0;
88 } 92 }
89 if (!bf[1].length) { 93 if (!bf[1].length) {
90 ASSERT(!bf[1].offset); 94 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
91 freeseen |= 1 << 1; 95 freeseen |= 1 << 1;
92 } 96 }
93 if (!bf[2].length) { 97 if (!bf[2].length) {
94 ASSERT(!bf[2].offset); 98 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
95 freeseen |= 1 << 2; 99 freeseen |= 1 << 2;
96 } 100 }
97 ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); 101
98 ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); 102 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
103 be16_to_cpu(bf[1].length));
104 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
105 be16_to_cpu(bf[2].length));
99 /* 106 /*
100 * Loop over the data/unused entries. 107 * Loop over the data/unused entries.
101 */ 108 */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
107 * doesn't need to be there. 114 * doesn't need to be there.
108 */ 115 */
109 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 116 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
110 ASSERT(lastfree == 0); 117 XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
111 ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 118 XFS_WANT_CORRUPTED_RETURN(
112 (char *)dup - (char *)hdr); 119 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
120 (char *)dup - (char *)hdr);
113 dfp = xfs_dir2_data_freefind(hdr, dup); 121 dfp = xfs_dir2_data_freefind(hdr, dup);
114 if (dfp) { 122 if (dfp) {
115 i = (int)(dfp - bf); 123 i = (int)(dfp - bf);
116 ASSERT((freeseen & (1 << i)) == 0); 124 XFS_WANT_CORRUPTED_RETURN(
125 (freeseen & (1 << i)) == 0);
117 freeseen |= 1 << i; 126 freeseen |= 1 << i;
118 } else { 127 } else {
119 ASSERT(be16_to_cpu(dup->length) <= 128 XFS_WANT_CORRUPTED_RETURN(
120 be16_to_cpu(bf[2].length)); 129 be16_to_cpu(dup->length) <=
130 be16_to_cpu(bf[2].length));
121 } 131 }
122 p += be16_to_cpu(dup->length); 132 p += be16_to_cpu(dup->length);
123 lastfree = 1; 133 lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
130 * The linear search is crude but this is DEBUG code. 140 * The linear search is crude but this is DEBUG code.
131 */ 141 */
132 dep = (xfs_dir2_data_entry_t *)p; 142 dep = (xfs_dir2_data_entry_t *)p;
133 ASSERT(dep->namelen != 0); 143 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
134 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); 144 XFS_WANT_CORRUPTED_RETURN(
135 ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 145 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
136 (char *)dep - (char *)hdr); 146 XFS_WANT_CORRUPTED_RETURN(
147 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
148 (char *)dep - (char *)hdr);
137 count++; 149 count++;
138 lastfree = 0; 150 lastfree = 0;
139 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 151 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
148 be32_to_cpu(lep[i].hashval) == hash) 160 be32_to_cpu(lep[i].hashval) == hash)
149 break; 161 break;
150 } 162 }
151 ASSERT(i < be32_to_cpu(btp->count)); 163 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
152 } 164 }
153 p += xfs_dir2_data_entsize(dep->namelen); 165 p += xfs_dir2_data_entsize(dep->namelen);
154 } 166 }
155 /* 167 /*
156 * Need to have seen all the entries and all the bestfree slots. 168 * Need to have seen all the entries and all the bestfree slots.
157 */ 169 */
158 ASSERT(freeseen == 7); 170 XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
159 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 171 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
160 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 172 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
161 if (lep[i].address == 173 if (lep[i].address ==
162 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 174 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
163 stale++; 175 stale++;
164 if (i > 0) 176 if (i > 0)
165 ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); 177 XFS_WANT_CORRUPTED_RETURN(
178 be32_to_cpu(lep[i].hashval) >=
179 be32_to_cpu(lep[i - 1].hashval));
166 } 180 }
167 ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 181 XFS_WANT_CORRUPTED_RETURN(count ==
168 ASSERT(stale == be32_to_cpu(btp->stale)); 182 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
183 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
169 } 184 }
185 return 0;
186}
187
188static void
189xfs_dir2_data_verify(
190 struct xfs_buf *bp)
191{
192 struct xfs_mount *mp = bp->b_target->bt_mount;
193 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
194 int block_ok = 0;
195
196 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
197 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
198
199 if (!block_ok) {
200 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
201 xfs_buf_ioerror(bp, EFSCORRUPTED);
202 }
203}
204
205/*
206 * Readahead of the first block of the directory when it is opened is completely
207 * oblivious to the format of the directory. Hence we can either get a block
208 * format buffer or a data format buffer on readahead.
209 */
210static void
211xfs_dir2_data_reada_verify(
212 struct xfs_buf *bp)
213{
214 struct xfs_mount *mp = bp->b_target->bt_mount;
215 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
216
217 switch (hdr->magic) {
218 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
219 bp->b_ops = &xfs_dir2_block_buf_ops;
220 bp->b_ops->verify_read(bp);
221 return;
222 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
223 xfs_dir2_data_verify(bp);
224 return;
225 default:
226 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
227 xfs_buf_ioerror(bp, EFSCORRUPTED);
228 break;
229 }
230}
231
232static void
233xfs_dir2_data_read_verify(
234 struct xfs_buf *bp)
235{
236 xfs_dir2_data_verify(bp);
237}
238
239static void
240xfs_dir2_data_write_verify(
241 struct xfs_buf *bp)
242{
243 xfs_dir2_data_verify(bp);
244}
245
246const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
247 .verify_read = xfs_dir2_data_read_verify,
248 .verify_write = xfs_dir2_data_write_verify,
249};
250
251static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
252 .verify_read = xfs_dir2_data_reada_verify,
253 .verify_write = xfs_dir2_data_write_verify,
254};
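Because the first directory block is read ahead before its format is known, the reada ops pair a dispatching read verifier with the ordinary data write verifier: the read side inspects the magic that actually arrived, re-points b_ops at the matching format, and runs that verifier. The same shape in a standalone sketch with mocked types and magic values (the kernel version reports corruption for unknown magics; here that is folded into data_verify):

#include <stdio.h>

struct buf;
struct buf_ops { void (*verify_read)(struct buf *); };
struct buf {
	const struct buf_ops *ops;
	unsigned magic;
	int error;
};

enum { BLOCK_MAGIC = 1, DATA_MAGIC = 2, EFSCORRUPTED = 117 };

static void block_verify(struct buf *bp)
{
	if (bp->magic != BLOCK_MAGIC)
		bp->error = EFSCORRUPTED;
}

static void data_verify(struct buf *bp)
{
	if (bp->magic != DATA_MAGIC)
		bp->error = EFSCORRUPTED;
}

static const struct buf_ops block_ops = { .verify_read = block_verify };

/* Readahead read verifier: decide the format from the disk contents. */
static void reada_verify(struct buf *bp)
{
	switch (bp->magic) {
	case BLOCK_MAGIC:
		bp->ops = &block_ops;	/* later writes verified as "block" */
		bp->ops->verify_read(bp);
		return;
	default:
		data_verify(bp);	/* data format, or flagged corrupt */
	}
}

int main(void)
{
	struct buf bp = { .magic = BLOCK_MAGIC };

	reada_verify(&bp);
	printf("swapped = %d, error = %d\n", bp.ops == &block_ops, bp.error);
	return 0;
}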
255
256
257int
258xfs_dir2_data_read(
259 struct xfs_trans *tp,
260 struct xfs_inode *dp,
261 xfs_dablk_t bno,
262 xfs_daddr_t mapped_bno,
263 struct xfs_buf **bpp)
264{
265 return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
266 XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
267}
268
269int
270xfs_dir2_data_readahead(
271 struct xfs_trans *tp,
272 struct xfs_inode *dp,
273 xfs_dablk_t bno,
274 xfs_daddr_t mapped_bno)
275{
276 return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
277 XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
170} 278}
171#endif
172 279
173/* 280/*
174 * Given a data block and an unused entry from that block, 281 * Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
409 */ 516 */
410 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, 517 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
411 XFS_DATA_FORK); 518 XFS_DATA_FORK);
412 if (error) { 519 if (error)
413 return error; 520 return error;
414 } 521 bp->b_ops = &xfs_dir2_data_buf_ops;
415 ASSERT(bp != NULL);
416 522
417 /* 523 /*
418 * Initialize the header. 524 * Initialize the header.
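The larger shift in this file is that __xfs_dir2_data_check() stops being DEBUG-only: each ASSERT becomes an XFS_WANT_CORRUPTED_RETURN, which checks at runtime and bails out with EFSCORRUPTED rather than tripping a debug kernel. A simplified sketch of what such a macro boils down to; the real one also reports the failure before returning:

#include <stdio.h>

#define EFSCORRUPTED 117

/* Simplified; the kernel macro also logs via XFS_ERROR_REPORT. */
#define WANT_CORRUPTED_RETURN(expr)		\
	do {					\
		if (!(expr))			\
			return EFSCORRUPTED;	\
	} while (0)

static int check_block(int magic, int count)
{
	WANT_CORRUPTED_RETURN(magic == 0x58443244);	/* "XD2D" */
	WANT_CORRUPTED_RETURN(count > 0);
	return 0;	/* block is good */
}

int main(void)
{
	printf("%d\n", check_block(0x58443244, 3));	/* 0 */
	printf("%d\n", check_block(0, 3));		/* 117 */
	return 0;
}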
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
48 int first, int last); 48 int first, int last);
49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); 49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
50 50
51static void
52xfs_dir2_leaf_verify(
53 struct xfs_buf *bp,
54 __be16 magic)
55{
56 struct xfs_mount *mp = bp->b_target->bt_mount;
57 struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
58 int block_ok = 0;
59
60 block_ok = hdr->info.magic == magic;
61 if (!block_ok) {
62 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
63 xfs_buf_ioerror(bp, EFSCORRUPTED);
64 }
65}
66
67static void
68xfs_dir2_leaf1_read_verify(
69 struct xfs_buf *bp)
70{
71 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
72}
73
74static void
75xfs_dir2_leaf1_write_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
79}
80
81void
82xfs_dir2_leafn_read_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
86}
87
88void
89xfs_dir2_leafn_write_verify(
90 struct xfs_buf *bp)
91{
92 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
93}
94
95static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
96 .verify_read = xfs_dir2_leaf1_read_verify,
97 .verify_write = xfs_dir2_leaf1_write_verify,
98};
99
100const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
101 .verify_read = xfs_dir2_leafn_read_verify,
102 .verify_write = xfs_dir2_leafn_write_verify,
103};
104
105static int
106xfs_dir2_leaf_read(
107 struct xfs_trans *tp,
108 struct xfs_inode *dp,
109 xfs_dablk_t fbno,
110 xfs_daddr_t mappedbno,
111 struct xfs_buf **bpp)
112{
113 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
114 XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
115}
116
117int
118xfs_dir2_leafn_read(
119 struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 xfs_dablk_t fbno,
122 xfs_daddr_t mappedbno,
123 struct xfs_buf **bpp)
124{
125 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
126 XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
127}
51 128
52/* 129/*
53 * Convert a block form directory to a leaf form directory. 130 * Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
125 /* 202 /*
126 * Fix up the block header, make it a data block. 203 * Fix up the block header, make it a data block.
127 */ 204 */
205 dbp->b_ops = &xfs_dir2_data_buf_ops;
128 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); 206 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
129 if (needscan) 207 if (needscan)
130 xfs_dir2_data_freescan(mp, hdr, &needlog); 208 xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
311 dp = args->dp; 389 dp = args->dp;
312 tp = args->trans; 390 tp = args->trans;
313 mp = dp->i_mount; 391 mp = dp->i_mount;
314 /* 392
315 * Read the leaf block. 393 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
316 */ 394 if (error)
317 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
318 XFS_DATA_FORK);
319 if (error) {
320 return error; 395 return error;
321 } 396
322 ASSERT(lbp != NULL);
323 /* 397 /*
324 * Look up the entry by hash value and name. 398 * Look up the entry by hash value and name.
325 * We know it's not there, our caller has already done a lookup. 399 * We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
494 hdr = dbp->b_addr; 568 hdr = dbp->b_addr;
495 bestsp[use_block] = hdr->bestfree[0].length; 569 bestsp[use_block] = hdr->bestfree[0].length;
496 grown = 1; 570 grown = 1;
497 } 571 } else {
498 /* 572 /*
499 * Already had space in some data block. 573 * Already had space in some data block.
500 * Just read that one in. 574 * Just read that one in.
501 */ 575 */
502 else { 576 error = xfs_dir2_data_read(tp, dp,
503 if ((error = 577 xfs_dir2_db_to_da(mp, use_block),
504 xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), 578 -1, &dbp);
505 -1, &dbp, XFS_DATA_FORK))) { 579 if (error) {
506 xfs_trans_brelse(tp, lbp); 580 xfs_trans_brelse(tp, lbp);
507 return error; 581 return error;
508 } 582 }
509 hdr = dbp->b_addr; 583 hdr = dbp->b_addr;
510 grown = 0; 584 grown = 0;
511 } 585 }
512 xfs_dir2_data_check(dp, dbp);
513 /* 586 /*
514 * Point to the biggest freespace in our data block. 587 * Point to the biggest freespace in our data block.
515 */ 588 */
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
892 * Read the directory block starting at the first mapping. 965 * Read the directory block starting at the first mapping.
893 */ 966 */
894 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); 967 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
895 error = xfs_da_read_buf(NULL, dp, map->br_startoff, 968 error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
896 map->br_blockcount >= mp->m_dirblkfsbs ? 969 map->br_blockcount >= mp->m_dirblkfsbs ?
897 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, 970 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
898 &bp, XFS_DATA_FORK);
899 971
900 /* 972 /*
901 * Should just skip over the data block instead of giving up. 973 * Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
922 */ 994 */
923 if (i > mip->ra_current && 995 if (i > mip->ra_current &&
924 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { 996 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
925 xfs_buf_readahead(mp->m_ddev_targp, 997 xfs_dir2_data_readahead(NULL, dp,
998 map[mip->ra_index].br_startoff + mip->ra_offset,
926 XFS_FSB_TO_DADDR(mp, 999 XFS_FSB_TO_DADDR(mp,
927 map[mip->ra_index].br_startblock + 1000 map[mip->ra_index].br_startblock +
928 mip->ra_offset), 1001 mip->ra_offset));
929 (int)BTOBB(mp->m_dirblksize));
930 mip->ra_current = i; 1002 mip->ra_current = i;
931 } 1003 }
932 1004
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
935 * use our mapping, but this is a very rare case. 1007 * use our mapping, but this is a very rare case.
936 */ 1008 */
937 else if (i > mip->ra_current) { 1009 else if (i > mip->ra_current) {
938 xfs_da_reada_buf(NULL, dp, 1010 xfs_dir2_data_readahead(NULL, dp,
939 map[mip->ra_index].br_startoff + 1011 map[mip->ra_index].br_startoff +
940 mip->ra_offset, 1012 mip->ra_offset, -1);
941 XFS_DATA_FORK);
942 mip->ra_current = i; 1013 mip->ra_current = i;
943 } 1014 }
944 1015
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
1177 * Get the buffer for the block. 1248 * Get the buffer for the block.
1178 */ 1249 */
1179 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, 1250 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
1180 XFS_DATA_FORK); 1251 XFS_DATA_FORK);
1181 if (error) { 1252 if (error)
1182 return error; 1253 return error;
1183 } 1254
1184 ASSERT(bp != NULL);
1185 leaf = bp->b_addr;
1186 /* 1255 /*
1187 * Initialize the header. 1256 * Initialize the header.
1188 */ 1257 */
1258 leaf = bp->b_addr;
1189 leaf->hdr.info.magic = cpu_to_be16(magic); 1259 leaf->hdr.info.magic = cpu_to_be16(magic);
1190 leaf->hdr.info.forw = 0; 1260 leaf->hdr.info.forw = 0;
1191 leaf->hdr.info.back = 0; 1261 leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
1198 * the block. 1268 * the block.
1199 */ 1269 */
1200 if (magic == XFS_DIR2_LEAF1_MAGIC) { 1270 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1271 bp->b_ops = &xfs_dir2_leaf1_buf_ops;
1201 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1272 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1202 ltp->bestcount = 0; 1273 ltp->bestcount = 0;
1203 xfs_dir2_leaf_log_tail(tp, bp); 1274 xfs_dir2_leaf_log_tail(tp, bp);
1204 } 1275 } else
1276 bp->b_ops = &xfs_dir2_leafn_buf_ops;
1205 *bpp = bp; 1277 *bpp = bp;
1206 return 0; 1278 return 0;
1207} 1279}
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
1372 dp = args->dp; 1444 dp = args->dp;
1373 tp = args->trans; 1445 tp = args->trans;
1374 mp = dp->i_mount; 1446 mp = dp->i_mount;
1375 /* 1447
1376 * Read the leaf block into the buffer. 1448 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
1377 */
1378 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1379 XFS_DATA_FORK);
1380 if (error) 1449 if (error)
1381 return error; 1450 return error;
1451
1382 *lbpp = lbp; 1452 *lbpp = lbp;
1383 leaf = lbp->b_addr; 1453 leaf = lbp->b_addr;
1384 xfs_dir2_leaf_check(dp, lbp); 1454 xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
1409 if (newdb != curdb) { 1479 if (newdb != curdb) {
1410 if (dbp) 1480 if (dbp)
1411 xfs_trans_brelse(tp, dbp); 1481 xfs_trans_brelse(tp, dbp);
1412 error = xfs_da_read_buf(tp, dp, 1482 error = xfs_dir2_data_read(tp, dp,
1413 xfs_dir2_db_to_da(mp, newdb), 1483 xfs_dir2_db_to_da(mp, newdb),
1414 -1, &dbp, XFS_DATA_FORK); 1484 -1, &dbp);
1415 if (error) { 1485 if (error) {
1416 xfs_trans_brelse(tp, lbp); 1486 xfs_trans_brelse(tp, lbp);
1417 return error; 1487 return error;
1418 } 1488 }
1419 xfs_dir2_data_check(dp, dbp);
1420 curdb = newdb; 1489 curdb = newdb;
1421 } 1490 }
1422 /* 1491 /*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
1451 ASSERT(cidb != -1); 1520 ASSERT(cidb != -1);
1452 if (cidb != curdb) { 1521 if (cidb != curdb) {
1453 xfs_trans_brelse(tp, dbp); 1522 xfs_trans_brelse(tp, dbp);
1454 error = xfs_da_read_buf(tp, dp, 1523 error = xfs_dir2_data_read(tp, dp,
1455 xfs_dir2_db_to_da(mp, cidb), 1524 xfs_dir2_db_to_da(mp, cidb),
1456 -1, &dbp, XFS_DATA_FORK); 1525 -1, &dbp);
1457 if (error) { 1526 if (error) {
1458 xfs_trans_brelse(tp, lbp); 1527 xfs_trans_brelse(tp, lbp);
1459 return error; 1528 return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
1738 /* 1807 /*
1739 * Read the offending data block. We need its buffer. 1808 * Read the offending data block. We need its buffer.
1740 */ 1809 */
1741 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, 1810 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
1742 XFS_DATA_FORK))) { 1811 if (error)
1743 return error; 1812 return error;
1744 }
1745 1813
1746 leaf = lbp->b_addr; 1814 leaf = lbp->b_addr;
1747 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1815 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
1864 /* 1932 /*
1865 * Read the freespace block. 1933 * Read the freespace block.
1866 */ 1934 */
1867 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, 1935 error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
1868 XFS_DATA_FORK))) { 1936 if (error)
1869 return error; 1937 return error;
1870 }
1871 free = fbp->b_addr; 1938 free = fbp->b_addr;
1872 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1939 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1873 ASSERT(!free->hdr.firstdb); 1940 ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
1890 xfs_dir2_leaf_compact(args, lbp); 1957 xfs_dir2_leaf_compact(args, lbp);
1891 else 1958 else
1892 xfs_dir2_leaf_log_header(tp, lbp); 1959 xfs_dir2_leaf_log_header(tp, lbp);
1960
1961 lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
1893 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); 1962 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
1963
1894 /* 1964 /*
1895 * Set up the leaf tail from the freespace block. 1965 * Set up the leaf tail from the freespace block.
1896 */ 1966 */
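xfs_dir2_leaf_init() and xfs_dir2_node_to_leaf() now select between the leaf1 and leafn ops based on the magic being written, preserving the invariant that b_ops always matches hdr.info.magic. One compact way to view that selection (leaf_ops_for is a hypothetical helper, not in the patch; the magic values are the real ones):

#include <stdio.h>

struct buf_ops { int dummy; };
static const struct buf_ops leaf1_ops, leafn_ops;

#define DIR2_LEAF1_MAGIC 0xd2f1		/* XFS_DIR2_LEAF1_MAGIC */
#define DIR2_LEAFN_MAGIC 0xd2ff		/* XFS_DIR2_LEAFN_MAGIC */

static const struct buf_ops *leaf_ops_for(unsigned magic)
{
	return magic == DIR2_LEAF1_MAGIC ? &leaf1_ops : &leafn_ops;
}

int main(void)
{
	printf("%d\n", leaf_ops_for(DIR2_LEAF1_MAGIC) == &leaf1_ops); /* 1 */
	printf("%d\n", leaf_ops_for(DIR2_LEAFN_MAGIC) == &leafn_ops); /* 1 */
	return 0;
}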
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
55static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 55static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
56 xfs_da_state_blk_t *fblk); 56 xfs_da_state_blk_t *fblk);
57 57
58static void
59xfs_dir2_free_verify(
60 struct xfs_buf *bp)
61{
62 struct xfs_mount *mp = bp->b_target->bt_mount;
63 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
64 int block_ok = 0;
65
66 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
67 if (!block_ok) {
68 XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
69 XFS_ERRLEVEL_LOW, mp, hdr);
70 xfs_buf_ioerror(bp, EFSCORRUPTED);
71 }
72}
73
74static void
75xfs_dir2_free_read_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_free_verify(bp);
79}
80
81static void
82xfs_dir2_free_write_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_free_verify(bp);
86}
87
88static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
89 .verify_read = xfs_dir2_free_read_verify,
90 .verify_write = xfs_dir2_free_write_verify,
91};
92
93
94static int
95__xfs_dir2_free_read(
96 struct xfs_trans *tp,
97 struct xfs_inode *dp,
98 xfs_dablk_t fbno,
99 xfs_daddr_t mappedbno,
100 struct xfs_buf **bpp)
101{
102 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
103 XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
104}
105
106int
107xfs_dir2_free_read(
108 struct xfs_trans *tp,
109 struct xfs_inode *dp,
110 xfs_dablk_t fbno,
111 struct xfs_buf **bpp)
112{
113 return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
114}
115
116static int
117xfs_dir2_free_try_read(
118 struct xfs_trans *tp,
119 struct xfs_inode *dp,
120 xfs_dablk_t fbno,
121 struct xfs_buf **bpp)
122{
123 return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
124}
125
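The read/try_read split above rides on the mappedbno argument that xfs_da_read_buf() already interprets specially: -1 means map the block and treat a missing mapping as an error, while -2 tolerates a hole and hands back a NULL buffer. A sketch of that convention with a mocked mapper:

#include <stdio.h>
#include <stddef.h>

#define ENOENT 2

struct buf { int blkno; };

/* Mocked: pretend only block 10 is actually mapped. */
static int map_block(long bno) { return bno == 10; }

static int read_buf(long bno, long mappedbno, struct buf **bpp)
{
	static struct buf b;

	*bpp = NULL;
	if (!map_block(bno)) {
		if (mappedbno == -2)
			return 0;	/* try-read: a hole is not an error */
		return ENOENT;		/* -1: caller demanded the block */
	}
	b.blkno = (int)bno;
	*bpp = &b;
	return 0;
}

int main(void)
{
	struct buf *bp;

	printf("%d\n", read_buf(11, -1, &bp));			/* 2   */
	printf("%d %d\n", read_buf(11, -2, &bp), bp == NULL);	/* 0 1 */
	return 0;
}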
58/* 126/*
59 * Log entries from a freespace block. 127 * Log entries from a freespace block.
60 */ 128 */
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
 	/*
 	 * Get the buffer for the new freespace block.
 	 */
-	if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
-			XFS_DATA_FORK))) {
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+			       XFS_DATA_FORK);
+	if (error)
 		return error;
-	}
-	ASSERT(fbp != NULL);
+	fbp->b_ops = &xfs_dir2_free_buf_ops;
+
 	free = fbp->b_addr;
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
 		*to = cpu_to_be16(off);
 	}
 	free->hdr.nused = cpu_to_be32(n);
+
+	lbp->b_ops = &xfs_dir2_leafn_buf_ops;
 	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+
 	/*
 	 * Log everything.
 	 */
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
 	 */
 	if (curbp)
 		xfs_trans_brelse(tp, curbp);
-	/*
-	 * Read the free block.
-	 */
-	error = xfs_da_read_buf(tp, dp,
+
+	error = xfs_dir2_free_read(tp, dp,
 				xfs_dir2_db_to_da(mp, newfdb),
-				-1, &curbp, XFS_DATA_FORK);
+				&curbp);
 	if (error)
 		return error;
 	free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
 			ASSERT(state->extravalid);
 			curbp = state->extrablk.bp;
 		} else {
-			error = xfs_da_read_buf(tp, dp,
+			error = xfs_dir2_data_read(tp, dp,
 						xfs_dir2_db_to_da(mp, newdb),
-						-1, &curbp, XFS_DATA_FORK);
+						-1, &curbp);
 			if (error)
 				return error;
 		}
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = (int)((char *)dep -
 							(char *)curbp->b_addr);
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_ops = &xfs_dir2_data_buf_ops;
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 		}
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
 		state->extrablk.index = -1;
 		state->extrablk.blkno = curdb;
 		state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+		curbp->b_ops = &xfs_dir2_data_buf_ops;
 	} else {
 		/* If the curbp is not the CI match block, drop it */
 		if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
 	}
 }
 
+static int
+xfs_dir2_data_block_free(
+	xfs_da_args_t		*args,
+	struct xfs_dir2_data_hdr *hdr,
+	struct xfs_dir2_free	*free,
+	xfs_dir2_db_t		fdb,
+	int			findex,
+	struct xfs_buf		*fbp,
+	int			longest)
+{
+	struct xfs_trans	*tp = args->trans;
+	int			logfree = 0;
+
+	if (!hdr) {
+		/* One less used entry in the free table. */
+		be32_add_cpu(&free->hdr.nused, -1);
+		xfs_dir2_free_log_header(tp, fbp);
+
+		/*
+		 * If this was the last entry in the table, we can trim the
+		 * table size back.  There might be other entries at the end
+		 * referring to non-existent data blocks, get those too.
+		 */
+		if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+			int	i;		/* free entry index */
+
+			for (i = findex - 1; i >= 0; i--) {
+				if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
+					break;
+			}
+			free->hdr.nvalid = cpu_to_be32(i + 1);
+			logfree = 0;
+		} else {
+			/* Not the last entry, just punch it out. */
+			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+			logfree = 1;
+		}
+		/*
+		 * If there are no useful entries left in the block,
+		 * get rid of the block if we can.
+		 */
+		if (!free->hdr.nused) {
+			int error;
+
+			error = xfs_dir2_shrink_inode(args, fdb, fbp);
+			if (error == 0) {
+				fbp = NULL;
+				logfree = 0;
+			} else if (error != ENOSPC || args->total != 0)
+				return error;
+			/*
+			 * It's possible to get ENOSPC if there is no
+			 * space reservation.  In this case someone
+			 * else will eventually get rid of this block.
+			 */
+		}
+	} else {
+		/*
+		 * Data block is not empty, just set the free entry to the new
+		 * value.
+		 */
+		free->bests[findex] = cpu_to_be16(longest);
+		logfree = 1;
+	}
+
+	/* Log the free entry that changed, unless we got rid of it. */
+	if (logfree)
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	return 0;
+}
+
 /*
  * Remove an entry from a node directory.
  * This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
 	xfs_dir2_db_t		fdb;		/* freeblock block number */
 	int			findex;		/* index in freeblock entries */
 	xfs_dir2_free_t		*free;		/* freeblock structure */
-	int			logfree;	/* need to log free entry */
 
 	/*
 	 * Convert the data block number to a free block,
 	 * read in the free block.
 	 */
 	fdb = xfs_dir2_db_to_fdb(mp, db);
-	if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
-			-1, &fbp, XFS_DATA_FORK))) {
+	error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+				   &fbp);
+	if (error)
 		return error;
-	}
 	free = fbp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 	ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
 	 * If we got rid of the data block, we can eliminate that entry
 	 * in the free block.
 	 */
-	if (hdr == NULL) {
-		/*
-		 * One less used entry in the free table.
-		 */
-		be32_add_cpu(&free->hdr.nused, -1);
-		xfs_dir2_free_log_header(tp, fbp);
-		/*
-		 * If this was the last entry in the table, we can
-		 * trim the table size back.  There might be other
-		 * entries at the end referring to non-existent
-		 * data blocks, get those too.
-		 */
-		if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
-			int	i;		/* free entry index */
-
-			for (i = findex - 1;
-			     i >= 0 &&
-			     free->bests[i] == cpu_to_be16(NULLDATAOFF);
-			     i--)
-				continue;
-			free->hdr.nvalid = cpu_to_be32(i + 1);
-			logfree = 0;
-		}
-		/*
-		 * Not the last entry, just punch it out.
-		 */
-		else {
-			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
-			logfree = 1;
-		}
-		/*
-		 * If there are no useful entries left in the block,
-		 * get rid of the block if we can.
-		 */
-		if (!free->hdr.nused) {
-			error = xfs_dir2_shrink_inode(args, fdb, fbp);
-			if (error == 0) {
-				fbp = NULL;
-				logfree = 0;
-			} else if (error != ENOSPC || args->total != 0)
-				return error;
-			/*
-			 * It's possible to get ENOSPC if there is no
-			 * space reservation.  In this case someone
-			 * else will eventually get rid of this block.
-			 */
-		}
-	}
-	/*
-	 * Data block is not empty, just set the free entry to
-	 * the new value.
-	 */
-	else {
-		free->bests[findex] = cpu_to_be16(longest);
-		logfree = 1;
-	}
-	/*
-	 * Log the free entry that changed, unless we got rid of it.
-	 */
-	if (logfree)
-		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	error = xfs_dir2_data_block_free(args, hdr, free,
+					 fdb, findex, fbp, longest);
+	if (error)
+		return error;
 	}
+
 	xfs_dir2_leafn_check(dp, bp);
 	/*
 	 * Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
 		/*
 		 * Read the sibling leaf block.
 		 */
-		if ((error =
-		    xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
-			    -1, &bp, XFS_DATA_FORK))) {
+		error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+					    blkno, -1, &bp);
+		if (error)
 			return error;
-		}
-		ASSERT(bp != NULL);
+
 		/*
 		 * Count bytes in the two blocks combined.
 		 */
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
 			 * This should be really rare, so there's no reason
 			 * to avoid it.
 			 */
-			if ((error = xfs_da_read_buf(tp, dp,
-					xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-					XFS_DATA_FORK))) {
+			error = xfs_dir2_free_try_read(tp, dp,
+						xfs_dir2_db_to_da(mp, fbno),
+						&fbp);
+			if (error)
 				return error;
-			}
-			if (unlikely(fbp == NULL)) {
+			if (!fbp)
 				continue;
-			}
 			free = fbp->b_addr;
 			ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 			findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
 		 * that was just allocated.
 		 */
 		fbno = xfs_dir2_db_to_fdb(mp, dbno);
-		if (unlikely(error = xfs_da_read_buf(tp, dp,
-				xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-				XFS_DATA_FORK)))
+		error = xfs_dir2_free_try_read(tp, dp,
+					       xfs_dir2_db_to_da(mp, fbno),
+					       &fbp);
+		if (error)
 			return error;
 
 		/*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
 			/*
 			 * Get a buffer for the new block.
 			 */
-			if ((error = xfs_da_get_buf(tp, dp,
-					xfs_dir2_db_to_da(mp, fbno),
-					-1, &fbp, XFS_DATA_FORK))) {
+			error = xfs_da_get_buf(tp, dp,
+					       xfs_dir2_db_to_da(mp, fbno),
+					       -1, &fbp, XFS_DATA_FORK);
+			if (error)
 				return error;
-			}
-			ASSERT(fbp != NULL);
+			fbp->b_ops = &xfs_dir2_free_buf_ops;
 
 			/*
 			 * Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
 		/*
 		 * Read the data block in.
 		 */
-		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
-					-1, &dbp, XFS_DATA_FORK);
+		error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+					   -1, &dbp);
 		if (error)
 			return error;
 		hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
 	/*
 	 * Read the freespace block.
 	 */
-	if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
-				XFS_DATA_FORK))) {
+	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+	if (error)
 		return error;
-	}
-
 	/*
 	 * There can be holes in freespace.  If fo is a hole, there's
 	 * nothing to do.
 	 */
-	if (bp == NULL) {
+	if (!bp)
 		return 0;
-	}
 	free = bp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 	/*
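Throughout these hunks the mappedbno argument carries the read policy down to xfs_da_read_buf(): the -1 form used by xfs_dir2_free_read() requires the block to exist, while the -2 form behind the new xfs_dir2_free_try_read() appears to return success with a NULL buffer when the block is a hole, which is what the xfs_dir2_node_trim_free() hunk relies on. A sketch of the calling convention (tp, dp and fo stand in for a real caller's state):

	struct xfs_buf	*bp;
	int		error;

	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
	if (error)
		return error;	/* genuine read failure or corruption */
	if (!bp)
		return 0;	/* hole in the freespace index: nothing to do */

	/* ... bp->b_addr has already passed the attached read verifier ... */
	xfs_trans_brelse(tp, bp);
	return 0;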
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
		const unsigned char *name, int len);
 
 /* xfs_dir2_block.c */
+extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
 extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
		xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 
 /* xfs_dir2_data.c */
 #ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#define	xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
 #else
 #define	xfs_dir2_data_check(dp,bp)
 #endif
+
+extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+
+extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
		struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
+extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+
+extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
		struct xfs_buf *dbp);
 extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
 extern int xfs_dir2_node_replace(struct xfs_da_args *args);
 extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
		int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, struct xfs_buf **bpp);
 
 /* xfs_dir2_sf.c */
 extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
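Note that __xfs_dir2_data_check() now returns an int rather than asserting, so code outside DEBUG builds (such as the verifiers) can react to a corrupt block; the xfs_dir2_data_check() macro keeps the old fire-and-forget form. A sketch of a caller using the int-returning form directly (the surrounding error handling is illustrative, not from the patch):

	/* act on a corrupt data block instead of asserting */
	error = __xfs_dir2_data_check(dp, bp);
	if (error) {
		xfs_trans_brelse(tp, bp);	/* drop the bad buffer */
		return XFS_ERROR(EFSCORRUPTED);
	}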
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
+static void
+xfs_dquot_buf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	struct xfs_disk_dquot	*ddq;
+	xfs_dqid_t		id = 0;
+	int			i;
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer.  If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		int	error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+					"xfs_dquot_read_verify");
+		if (error) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			break;
+		}
+	}
+}
+
+static void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
+
+void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
 
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read = xfs_dquot_buf_read_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
 
 /*
  * Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
	error = xfs_buf_geterror(bp);
	if (error)
		goto error1;
+	bp->b_ops = &xfs_dquot_buf_ops;
 
	/*
	 * Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
 
	return (error);
 }
+STATIC int
+xfs_qm_dqrepair(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp,
+	xfs_dqid_t		firstid,
+	struct xfs_buf		**bpp)
+{
+	int			error;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dqblk	*d;
+	int			i;
+
+	/*
+	 * Read the buffer without verification so we get the corrupted
+	 * buffer returned to us.  Make sure we verify it on write, though.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen,
+				   0, bpp, NULL);
+
+	if (error) {
+		ASSERT(*bpp == NULL);
+		return XFS_ERROR(error);
+	}
+	(*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+	ASSERT(xfs_buf_islocked(*bpp));
+	d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+	/* Do the actual repair of dquots in this buffer */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		ddq = &d[i].dd_diskdq;
+		error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+				       dqp->dq_flags & XFS_DQ_ALLTYPES,
+				       XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+		if (error) {
+			/* repair failed, we're screwed */
+			xfs_trans_brelse(tp, *bpp);
+			return XFS_ERROR(EIO);
+		}
+	}
+
+	return 0;
+}
 
 /*
  * Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
	xfs_buf_t	*bp;
	xfs_inode_t	*quotip = XFS_DQ_TO_QIP(dqp);
	xfs_mount_t	*mp = dqp->q_mount;
-	xfs_disk_dquot_t *ddq;
	xfs_dqid_t	id = be32_to_cpu(dqp->q_core.d_id);
	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
 
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
					   dqp->q_blkno,
					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp);
-		if (error || !bp)
-			return XFS_ERROR(error);
-	}
-
-	ASSERT(xfs_buf_islocked(bp));
+					   0, &bp, &xfs_dquot_buf_ops);
 
-	/*
-	 * calculate the location of the dquot inside the buffer.
-	 */
-	ddq = bp->b_addr + dqp->q_bufoffset;
+		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+					mp->m_quotainfo->qi_dqperchunk;
+			ASSERT(bp == NULL);
+			error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+		}
 
-	/*
-	 * A simple sanity check in case we got a corrupted dquot...
-	 */
-	error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
-			flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-			"dqtobp");
-	if (error) {
-		if (!(flags & XFS_QMOPT_DQREPAIR)) {
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EIO);
+		if (error) {
+			ASSERT(bp == NULL);
+			return XFS_ERROR(error);
		}
	}
 
+	ASSERT(xfs_buf_islocked(bp));
	*O_bpp = bp;
-	*O_ddpp = ddq;
+	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
	return (0);
 }
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
	 * Get the buffer containing the on-disk dquot
	 */
	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-				   mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
	if (error)
		goto out_unlock;
 
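The repair path above is deliberately two-phase: the buffer is first read with a NULL ops pointer so the read verifier cannot reject the corrupted contents, xfs_dquot_buf_ops is attached immediately so any later writeback is verified, and each dquot is then fixed in place with xfs_qm_dqcheck(..., XFS_QMOPT_DQREPAIR, ...). A condensed sketch of the flow (blkno stands in for dqp->q_blkno):

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno,
				   mp->m_quotainfo->qi_dqchunklen,
				   0, &bp, NULL);	/* NULL ops: skip the read verifier */
	if (error)
		return XFS_ERROR(error);
	bp->b_ops = &xfs_dquot_buf_ops;	/* verify the repaired contents on writeback */
	/* ... xfs_qm_dqcheck(..., XFS_QMOPT_DQREPAIR, ...) on each dquot ... */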
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
	return dqp;
 }
 
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
 #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
  * valid before the operation, it will be read from disk before
  * being partially zeroed.
  */
-STATIC int
+int
 xfs_iozero(
	struct xfs_inode	*ip,	/* inode			*/
	loff_t			pos,	/* offset in file		*/
@@ -255,15 +257,14 @@ xfs_file_aio_read(
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
-		if ((iocb->ki_pos & target->bt_smask) ||
-		    (size & target->bt_smask)) {
-			if (iocb->ki_pos == i_size_read(inode))
+		if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+			if (pos == i_size_read(inode))
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}
 
-	n = mp->m_super->s_maxbytes - iocb->ki_pos;
+	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;
 
@@ -289,20 +290,21 @@ xfs_file_aio_read(
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
		if (inode->i_mapping->nrpages) {
-			ret = -xfs_flushinval_pages(ip,
-					(iocb->ki_pos & PAGE_CACHE_MASK),
-					-1, FI_REMAPF_LOCKED);
+			ret = -filemap_write_and_wait_range(
+							VFS_I(ip)->i_mapping,
+							pos, -1);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}
+			truncate_pagecache_range(VFS_I(ip), pos, -1);
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}
 
-	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+	trace_xfs_file_read(ip, size, pos, ioflags);
 
-	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+	ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
		goto out;
 
	if (mapping->nrpages) {
-		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
-						FI_REMAPF_LOCKED);
+		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						    pos, -1);
		if (ret)
			goto out;
+		truncate_pagecache_range(VFS_I(ip), pos, -1);
	}
 
	/*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
 write_retry:
	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
	 */
	if (ret == -ENOSPC && !enospc) {
		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip->i_mount);
+		goto write_retry;
	}
 
	current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
	 */
	mode = xfs_ilock_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
-		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+		xfs_dir2_data_readahead(NULL, ip, 0, -1);
	xfs_iunlock(ip, mode);
	return 0;
 }
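The xfs_flushinval_pages() calls are replaced by their open-coded VFS equivalents: write back any dirty pagecache over the range, then drop it, so the following direct I/O cannot see stale cached pages. The explicit negation matches XFS's positive-errno convention of this era, since the filemap helpers return negative errnos. A sketch of the resulting idiom, assuming ip and pos as in the hunks above:

	if (VFS_I(ip)->i_mapping->nrpages) {
		/* flush dirty pages over the range, then toss the cache */
		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						    pos, -1);
		if (ret)
			return ret;
		truncate_pagecache_range(VFS_I(ip), pos, -1);
	}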
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
 #define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
 #define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800	/* 32-bit project IDs	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
 
 
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
 
 
 /*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION		1
+struct xfs_eofblocks {
+	__u32		eof_version;
+	__u32		eof_flags;
+	uid_t		eof_uid;
+	gid_t		eof_gid;
+	prid_t		eof_prid;
+	__u32		pad32;
+	__u64		eof_min_file_size;
+	__u64		pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC		(1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID		(1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_VALID	\
+	(XFS_EOF_FLAGS_SYNC |	\
+	 XFS_EOF_FLAGS_UID |	\
+	 XFS_EOF_FLAGS_GID |	\
+	 XFS_EOF_FLAGS_PRID |	\
+	 XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
  * The user-level Handle Request interface structure.
  */
 typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
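A hypothetical userspace sketch of driving the new speculative-preallocation trim interface. The header providing these definitions and the choice of file descriptor (presumably any fd on the target filesystem) are assumptions, and the filter values are only an example:

	#include <sys/ioctl.h>
	#include <stdio.h>
	#include <xfs/xfs.h>	/* assumed to carry the new definitions */

	/* trim post-EOF preallocation for uid 1000's files of at least 1 MiB */
	struct xfs_eofblocks eofb = {
		.eof_version		= XFS_EOFBLOCKS_VERSION,
		.eof_flags		= XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID |
					  XFS_EOF_FLAGS_MINFILESIZE,
		.eof_uid		= 1000,
		.eof_min_file_size	= 1024 * 1024,
	};

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)
		perror("XFS_IOC_FREE_EOFBLOCKS");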
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4c..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	/* can't toss partial tail pages, so mask them out */
-	last &= ~(PAGE_SIZE - 1);
-	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-
-	trace_xfs_pagecache_inval(ip, first, last);
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = filemap_write_and_wait_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (!ret)
-		truncate_inode_pages_range(mapping, first, last);
-	return -ret;
-}
-
-int
-xfs_flush_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	uint64_t	flags,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-	int		ret2;
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = -filemap_fdatawrite_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (flags & XBF_ASYNC)
-		return ret;
-	ret2 = xfs_wait_on_pages(ip, first, last);
-	if (!ret)
-		ret = ret2;
-	return ret;
-}
-
-int
-xfs_wait_on_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-		return -filemap_fdatawait_range(mapping, first,
-			last == -1 ? XFS_ISIZE(ip) - 1 : last);
-	}
-	return 0;
-}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf7..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
			(xfs_sb_version_hasattr2(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
			mp->m_sb.sb_logsectsize : BBSIZE;
		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
	return 0;
 }
 
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
+
+	return bp;
+}
+
 static int
 xfs_growfs_data_private(
	xfs_mount_t		*mp,		/* mount point for filesystem */
	xfs_growfs_data_t	*in)		/* growfs data input struct */
 {
	xfs_agf_t		*agf;
+	struct xfs_agfl		*agfl;
	xfs_agi_t		*agi;
	xfs_agnumber_t		agno;
	xfs_extlen_t		agsize;
	xfs_extlen_t		tmpsize;
	xfs_alloc_rec_t		*arec;
-	struct xfs_btree_block	*block;
	xfs_buf_t		*bp;
	int			bucket;
	int			dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
	dpct = pct - mp->m_sb.sb_imax_pct;
	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0, NULL);
	if (!bp)
		return EIO;
+	if (bp->b_error) {
+		int	error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
	xfs_buf_relse(bp);
 
	new = nb;	/* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
	nfree = 0;
	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
		/*
-		 * AG freelist header block
+		 * AG freespace header block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			  XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			  XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agf_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
+
		agf = XFS_BUF_TO_AGF(bp);
-		memset(agf, 0, mp->m_sb.sb_sectsize);
		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
		agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@ xfs_growfs_data_private(
			goto error0;
 
		/*
+		 * AG freelist header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agfl_buf_ops);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
+
+		agfl = XFS_BUF_TO_AGFL(bp);
+		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
		 * AG inode header block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			  XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			  XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agi_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
+
		agi = XFS_BUF_TO_AGI(bp);
-		memset(agi, 0, mp->m_sb.sb_sectsize);
		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
		agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
		/*
		 * BNO btree root block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
+
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
		arec->ar_blockcount = cpu_to_be32(
			agsize - be32_to_cpu(arec->ar_startblock));
+
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
		/*
		 * CNT btree root block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
		arec->ar_blockcount = cpu_to_be32(
			agsize - be32_to_cpu(arec->ar_startblock));
		nfree += be32_to_cpu(arec->ar_blockcount);
+
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
		/*
		 * INO btree root block
		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
		if (!bp) {
			error = ENOMEM;
			goto error0;
		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = 0;
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+
+		xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
		if (error)
@@ -399,9 +443,28 @@ xfs_growfs_data_private(
 
	/* update secondary superblocks. */
	for (agno = 1; agno < nagcount; agno++) {
-		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+		error = 0;
+		/*
+		 * new secondary superblocks need to be zeroed, not read from
+		 * disk as the contents of the new area we are growing into is
+		 * completely unknown.
+		 */
+		if (agno < oagcount) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
+				  &xfs_sb_buf_ops);
+		} else {
+			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0);
+			if (bp) {
+				bp->b_ops = &xfs_sb_buf_ops;
+				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+			} else
+				error = ENOMEM;
+		}
+
		if (error) {
			xfs_warn(mp,
		"error %d reading secondary superblock for ag %d",
@@ -409,6 +472,7 @@ xfs_growfs_data_private(
			break;
		}
		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
		/*
		 * If we get an error writing out the alternate superblocks,
		 * just issue a warning and continue.  The real work is
@@ -423,7 +487,7 @@ xfs_growfs_data_private(
			break;	/* no point in continuing */
		}
	}
-	return 0;
+	return error;
 
  error0:
	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
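xfs_growfs_get_hdr_buf() folds four copies of get-buffer-plus-memset into one helper: an uncached buffer (so no perag is attached, which is why the AGI verifier below tolerates a NULL b_pag), zeroed, with the matching verifier installed so each freshly built header is checked as it is written. A sketch of one use, following the AGF hunk above:

	bp = xfs_growfs_get_hdr_buf(mp,
			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &xfs_agf_buf_ops);
	if (!bp)
		return ENOMEM;		/* positive errno, XFS-internal convention */
	/* ... fill in the AGF fields; the buffer is already zeroed ... */
	error = xfs_bwrite(bp);		/* the write verifier runs here */
	xfs_buf_relse(bp);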
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
  */
 xfs_param_t xfs_params = {
			  /* MIN		DFLT		MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
	.rotorstep	= {	1,		1,		255	},
	.inherit_nodfrg	= {	0,		1,		1	},
	.fstrm_timer	= {	1,		30*100,		3600*100},
+	.eofb_timer	= {	1,		300,		3600*24},
 };
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 445bf1aef31c..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
		 */
		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					 mp->m_bsize * blks_per_cluster, 0);
+					 mp->m_bsize * blks_per_cluster,
+					 XBF_UNMAPPED);
		if (!fbuf)
			return ENOMEM;
		/*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
		 * to log a whole cluster of inodes instead of all the
		 * individual transactions causing a lot of log traffic.
		 */
+		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
		for (i = 0; i < ninodes; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;
@@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc(
						/* boundary */
	struct xfs_perag *pag;
 
+	memset(&args, 0, sizeof(args));
	args.tp = tp;
	args.mp = tp->t_mountp;
 
@@ -876,9 +879,9 @@ error0:
  * This function is designed to be called twice if it has to do an allocation
  * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
  * If an inode is available without having to perform an allocation, an inode
- * number is returned.  In this case, *IO_agbp would be NULL.  If an allocation
- * needes to be done, xfs_dialloc would return the current AGI buffer in
- * *IO_agbp.  The caller should then commit the current transaction, allocate a
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
  * new transaction, and call xfs_dialloc() again, passing in the previous value
  * of *IO_agbp.  IO_agbp should be held across the transactions.  Since the AGI
  * buffer is locked across the two calls, the second call is guaranteed to have
@@ -1471,6 +1474,57 @@ xfs_check_agi_unlinked(
 #define xfs_check_agi_unlinked(agi)
 #endif
 
+static void
+xfs_agi_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
+	int		agi_ok;
+
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag)
+		agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+						bp->b_pag->pag_agno;
+
+	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+	xfs_check_agi_unlinked(agi);
+}
+
+static void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.verify_read = xfs_agi_read_verify,
+	.verify_write = xfs_agi_write_verify,
+};
+
 /*
  * Read in the allocation group header (inode allocation section)
  */
@@ -1481,38 +1535,18 @@ xfs_read_agi(
	xfs_agnumber_t		agno,	/* allocation group number */
	struct xfs_buf		**bpp)	/* allocation group hdr buf */
 {
-	struct xfs_agi		*agi;	/* allocation group header */
-	int			agi_ok;	/* agi is consistent */
	int			error;
 
	ASSERT(agno != NULLAGNUMBER);
 
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp);
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
	if (error)
		return error;
 
	ASSERT(!xfs_buf_geterror(*bpp));
-	agi = XFS_BUF_TO_AGI(*bpp);
-
-	/*
-	 * Validate the magic number of the agi block.
-	 */
-	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
-		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
-		be32_to_cpu(agi->agi_seqno) == agno;
-	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-			XFS_RANDOM_IALLOC_READ_AGI))) {
-		XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
-				     mp, agi);
-		xfs_trans_brelse(tp, *bpp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-
	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
-	xfs_check_agi_unlinked(agi);
	return 0;
 }
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
		xfs_inobt_rec_incore_t *rec, int *stat);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
			  cur->bc_rec.i.ir_startino;
 }
 
+void
+xfs_inobt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+	int			sblock_ok; /* block passes checks */
+
+	/* magic number and level verification */
+	level = be16_to_cpu(block->bb_level);
+	sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+		    level < mp->m_in_maxlevels;
+
+	/* numrecs verification */
+	sblock_ok = sblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+	/* sibling pointer verification */
+	sblock_ok = sblock_ok &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (!sblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_inobt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
+
+static void
+xfs_inobt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+};
+
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
	.key_diff		= xfs_inobt_key_diff,
+	.buf_ops		= &xfs_inobt_buf_ops,
 #ifdef DEBUG
	.keys_inorder		= xfs_inobt_keys_inorder,
	.recs_inorder		= xfs_inobt_recs_inorder,
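The sibling checks in xfs_inobt_verify() encode an invariant of short-form btree blocks: a sibling pointer is either NULLAGBLOCK or an in-range AG block number, and never zero, since block zero of an AG holds the superblock. The same predicate, factored into a helper for clarity (xfs_sibling_ok() is illustrative, not in the patch):

	static bool
	xfs_sibling_ok(
		struct xfs_mount	*mp,
		__be32			sib)
	{
		/* NULLAGBLOCK marks "no sibling"; 0 can never be a valid block */
		if (sib == cpu_to_be32(NULLAGBLOCK))
			return true;
		return sib != 0 &&
		       be32_to_cpu(sib) < mp->m_sb.sb_agblocks;
	}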
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 9500caf15acf..96e344e3e927 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_log_priv.h"
22#include "xfs_inum.h" 23#include "xfs_inum.h"
23#include "xfs_trans.h" 24#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
35#include "xfs_quota.h" 36#include "xfs_quota.h"
36#include "xfs_trace.h" 37#include "xfs_trace.h"
37#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_icache.h"
38 40
39#include <linux/kthread.h> 41#include <linux/kthread.h>
40#include <linux/freezer.h> 42#include <linux/freezer.h>
41 43
42struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 44STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
45 struct xfs_perag *pag, struct xfs_inode *ip);
46
47/*
48 * Allocate and initialise an xfs_inode.
49 */
50STATIC struct xfs_inode *
51xfs_inode_alloc(
52 struct xfs_mount *mp,
53 xfs_ino_t ino)
54{
55 struct xfs_inode *ip;
56
57 /*
58 * if this didn't occur in transactions, we could use
59 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
60 * code up to do this anyway.
61 */
62 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
63 if (!ip)
64 return NULL;
65 if (inode_init_always(mp->m_super, VFS_I(ip))) {
66 kmem_zone_free(xfs_inode_zone, ip);
67 return NULL;
68 }
69
70 ASSERT(atomic_read(&ip->i_pincount) == 0);
71 ASSERT(!spin_is_locked(&ip->i_flags_lock));
72 ASSERT(!xfs_isiflocked(ip));
73 ASSERT(ip->i_ino == 0);
74
75 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
76
77 /* initialise the xfs inode */
78 ip->i_ino = ino;
79 ip->i_mount = mp;
80 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
81 ip->i_afp = NULL;
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0;
84 ip->i_delayed_blks = 0;
85 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
86
87 return ip;
88}
89
90STATIC void
91xfs_inode_free_callback(
92 struct rcu_head *head)
93{
94 struct inode *inode = container_of(head, struct inode, i_rcu);
95 struct xfs_inode *ip = XFS_I(inode);
96
97 kmem_zone_free(xfs_inode_zone, ip);
98}
99
100STATIC void
101xfs_inode_free(
102 struct xfs_inode *ip)
103{
104 switch (ip->i_d.di_mode & S_IFMT) {
105 case S_IFREG:
106 case S_IFDIR:
107 case S_IFLNK:
108 xfs_idestroy_fork(ip, XFS_DATA_FORK);
109 break;
110 }
111
112 if (ip->i_afp)
113 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
114
115 if (ip->i_itemp) {
116 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
117 xfs_inode_item_destroy(ip);
118 ip->i_itemp = NULL;
119 }
120
121 /* asserts to verify all state is correct here */
122 ASSERT(atomic_read(&ip->i_pincount) == 0);
123 ASSERT(!spin_is_locked(&ip->i_flags_lock));
124 ASSERT(!xfs_isiflocked(ip));
125
126 /*
127 * Because we use RCU freeing we need to ensure the inode always
128 * appears to be reclaimed with an invalid inode number when in the
129 * free state. The ip->i_flags_lock provides the barrier against lookup
130 * races.
131 */
132 spin_lock(&ip->i_flags_lock);
133 ip->i_flags = XFS_IRECLAIM;
134 ip->i_ino = 0;
135 spin_unlock(&ip->i_flags_lock);
136
137 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138}
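The free path above is the classic RCU deferred-free idiom: invalidate the object's identity under its lock, then hand the memory to call_rcu() so that no reader can see it recycled within a grace period. A minimal kernel-style sketch of the idiom, with hypothetical my_obj names that are not part of this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
	int			id;	/* 0 means "being freed" */
	spinlock_t		lock;
	struct rcu_head		rcu;
};

static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

static void my_obj_free(struct my_obj *obj)
{
	/*
	 * Clear the id under the lock before the grace period starts
	 * so that concurrent RCU readers can detect the free, exactly
	 * as xfs_inode_free() zeroes ip->i_ino above.
	 */
	spin_lock(&obj->lock);
	obj->id = 0;
	spin_unlock(&obj->lock);
	call_rcu(&obj->rcu, my_obj_free_rcu);
}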
139
140/*
141 * Check the validity of the inode we just found in the cache
142 */
143static int
144xfs_iget_cache_hit(
145 struct xfs_perag *pag,
146 struct xfs_inode *ip,
147 xfs_ino_t ino,
148 int flags,
149 int lock_flags) __releases(RCU)
150{
151 struct inode *inode = VFS_I(ip);
152 struct xfs_mount *mp = ip->i_mount;
153 int error;
154
155 /*
156 * check for re-use of an inode within an RCU grace period due to the
157 * radix tree nodes not being updated yet. We monitor for this by
158 * setting the inode number to zero before freeing the inode structure.
159 * If the inode has been reallocated and set up, then the inode number
160 * will not match, so check for that, too.
161 */
162 spin_lock(&ip->i_flags_lock);
163 if (ip->i_ino != ino) {
164 trace_xfs_iget_skip(ip);
165 XFS_STATS_INC(xs_ig_frecycle);
166 error = EAGAIN;
167 goto out_error;
168 }
169
170
171 /*
172 * If we are racing with another cache hit that is currently
173 * instantiating this inode or currently recycling it out of
174 * reclaimable state, wait for the initialisation to complete
175 * before continuing.
176 *
177 * XXX(hch): eventually we should do something equivalent to
178 * wait_on_inode to wait for these flags to be cleared
179 * instead of polling for it.
180 */
181 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
182 trace_xfs_iget_skip(ip);
183 XFS_STATS_INC(xs_ig_frecycle);
184 error = EAGAIN;
185 goto out_error;
186 }
187
188 /*
189 * If lookup is racing with unlink return an error immediately.
190 */
191 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
192 error = ENOENT;
193 goto out_error;
194 }
195
196 /*
197 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
198 * Need to carefully get it back into usable state.
199 */
200 if (ip->i_flags & XFS_IRECLAIMABLE) {
201 trace_xfs_iget_reclaim(ip);
202
203 /*
204 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
205 * from stomping over us while we recycle the inode. We can't
206 * clear the radix tree reclaimable tag yet as it requires
207 * pag_ici_lock to be held exclusive.
208 */
209 ip->i_flags |= XFS_IRECLAIM;
210
211 spin_unlock(&ip->i_flags_lock);
212 rcu_read_unlock();
213
214 error = -inode_init_always(mp->m_super, inode);
215 if (error) {
216 /*
217 * Re-initializing the inode failed, and we are in deep
218 * trouble. Try to re-add it to the reclaim list.
219 */
220 rcu_read_lock();
221 spin_lock(&ip->i_flags_lock);
222
223 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
224 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
225 trace_xfs_iget_reclaim_fail(ip);
226 goto out_error;
227 }
228
229 spin_lock(&pag->pag_ici_lock);
230 spin_lock(&ip->i_flags_lock);
231
232 /*
233 * Clear the per-lifetime state in the inode as we are now
234 * effectively a new inode and need to return to the initial
235 * state before reuse occurs.
236 */
237 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
238 ip->i_flags |= XFS_INEW;
239 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
240 inode->i_state = I_NEW;
241
242 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
243 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
244
245 spin_unlock(&ip->i_flags_lock);
246 spin_unlock(&pag->pag_ici_lock);
247 } else {
248 /* If the VFS inode is being torn down, pause and try again. */
249 if (!igrab(inode)) {
250 trace_xfs_iget_skip(ip);
251 error = EAGAIN;
252 goto out_error;
253 }
254
255 /* We've got a live one. */
256 spin_unlock(&ip->i_flags_lock);
257 rcu_read_unlock();
258 trace_xfs_iget_hit(ip);
259 }
260
261 if (lock_flags != 0)
262 xfs_ilock(ip, lock_flags);
263
264 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
265 XFS_STATS_INC(xs_ig_found);
266
267 return 0;
268
269out_error:
270 spin_unlock(&ip->i_flags_lock);
271 rcu_read_unlock();
272 return error;
273}
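xfs_iget_cache_hit() is the reader side of that idiom: the lookup runs under rcu_read_lock() and must revalidate the identity under the object's lock before trusting it. Continuing the hypothetical my_obj sketch from above:

#include <linux/radix-tree.h>

static struct my_obj *my_obj_lookup(struct radix_tree_root *root,
				    unsigned long index, int wanted_id)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = radix_tree_lookup(root, index);
	if (obj) {
		spin_lock(&obj->lock);
		if (obj->id != wanted_id)
			obj = NULL;	/* freed or reused in this grace period; retry */
		spin_unlock(&obj->lock);
	}
	rcu_read_unlock();
	return obj;
}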
274
275
276static int
277xfs_iget_cache_miss(
278 struct xfs_mount *mp,
279 struct xfs_perag *pag,
280 xfs_trans_t *tp,
281 xfs_ino_t ino,
282 struct xfs_inode **ipp,
283 int flags,
284 int lock_flags)
285{
286 struct xfs_inode *ip;
287 int error;
288 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
289 int iflags;
290
291 ip = xfs_inode_alloc(mp, ino);
292 if (!ip)
293 return ENOMEM;
294
295 error = xfs_iread(mp, tp, ip, flags);
296 if (error)
297 goto out_destroy;
298
299 trace_xfs_iget_miss(ip);
300
301 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
302 error = ENOENT;
303 goto out_destroy;
304 }
305
306 /*
307 * Preload the radix tree so we can insert safely under the
308 * write spinlock. Note that we cannot sleep inside the preload
309 * region. Since we can be called from transaction context, don't
310 * recurse into the file system.
311 */
312 if (radix_tree_preload(GFP_NOFS)) {
313 error = EAGAIN;
314 goto out_destroy;
315 }
316
317 /*
318 * Because the inode hasn't been added to the radix-tree yet it can't
319 * be found by another thread, so we can do the non-sleeping lock here.
320 */
321 if (lock_flags) {
322 if (!xfs_ilock_nowait(ip, lock_flags))
323 BUG();
324 }
325
326 /*
327 * These values must be set before inserting the inode into the radix
328 * tree as the moment it is inserted a concurrent lookup (allowed by the
329 * RCU locking mechanism) can find it and that lookup must see that this
330 * is an inode currently under construction (i.e. that XFS_INEW is set).
331 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
332 * memory barrier that ensures this detection works correctly at lookup
333 * time.
334 */
335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL;
339 xfs_iflags_set(ip, iflags);
340
341 /* insert the new inode */
342 spin_lock(&pag->pag_ici_lock);
343 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
344 if (unlikely(error)) {
345 WARN_ON(error != -EEXIST);
346 XFS_STATS_INC(xs_ig_dup);
347 error = EAGAIN;
348 goto out_preload_end;
349 }
350 spin_unlock(&pag->pag_ici_lock);
351 radix_tree_preload_end();
352
353 *ipp = ip;
354 return 0;
355
356out_preload_end:
357 spin_unlock(&pag->pag_ici_lock);
358 radix_tree_preload_end();
359 if (lock_flags)
360 xfs_iunlock(ip, lock_flags);
361out_destroy:
362 __destroy_inode(VFS_I(ip));
363 xfs_inode_free(ip);
364 return error;
365}
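The preload pattern in the miss path generalises beyond inode caching: radix_tree_preload() reserves tree nodes up front with the given gfp mask, so the insert itself never allocates and is safe under a spinlock. A hedged sketch with hypothetical names:

static int my_cache_insert(struct radix_tree_root *root, spinlock_t *lock,
			   unsigned long index, void *item)
{
	int error;

	/* GFP_NOFS because this can run in transaction context */
	error = radix_tree_preload(GFP_NOFS);
	if (error)
		return error;

	spin_lock(lock);
	error = radix_tree_insert(root, index, item);
	spin_unlock(lock);
	radix_tree_preload_end();
	return error;
}

radix_tree_preload() leaves preemption disabled on success, which is why the matching radix_tree_preload_end() must follow with no sleep in between.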
366
367/*
368 * Look up an inode by number in the given file system.
369 * The inode is looked up in the cache held in each AG.
370 * If the inode is found in the cache, initialise the vfs inode
371 * if necessary.
372 *
373 * If it is not in core, read it in from the file system's device,
374 * add it to the cache and initialise the vfs inode.
375 *
376 * The inode is locked according to the value of the lock_flags parameter.
377 * This flag parameter indicates how and if the inode's IO lock and inode lock
378 * should be taken.
379 *
380 * mp -- the mount point structure for the current file system. It points
381 * to the inode hash table.
382 * tp -- a pointer to the current transaction if there is one. This is
383 * simply passed through to the xfs_iread() call.
384 * ino -- the number of the inode desired. This is the unique identifier
385 * within the file system for the inode being requested.
386 * lock_flags -- flags indicating how to lock the inode. See the comment
387 * for xfs_ilock() for a list of valid values.
388 */
389int
390xfs_iget(
391 xfs_mount_t *mp,
392 xfs_trans_t *tp,
393 xfs_ino_t ino,
394 uint flags,
395 uint lock_flags,
396 xfs_inode_t **ipp)
397{
398 xfs_inode_t *ip;
399 int error;
400 xfs_perag_t *pag;
401 xfs_agino_t agino;
402
403 /*
404 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
405 * doesn't get freed while it's being referenced during a
406 * radix tree traversal here. It assumes this function
407 * acquires only the ILOCK (and therefore it has no need to
408 * involve the IOLOCK in this synchronization).
409 */
410 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
411
412 /* reject inode numbers outside existing AGs */
413 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
414 return EINVAL;
415
416 /* get the perag structure and ensure that it's inode capable */
417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
418 agino = XFS_INO_TO_AGINO(mp, ino);
419
420again:
421 error = 0;
422 rcu_read_lock();
423 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
424
425 if (ip) {
426 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
427 if (error)
428 goto out_error_or_again;
429 } else {
430 rcu_read_unlock();
431 XFS_STATS_INC(xs_ig_missed);
432
433 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
434 flags, lock_flags);
435 if (error)
436 goto out_error_or_again;
437 }
438 xfs_perag_put(pag);
439
440 *ipp = ip;
441
442 /*
443 * If we have a real type for an on-disk inode, we can set ops(&unlock)
444 * now. If it's a new inode being created, xfs_ialloc will handle it.
445 */
446 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
447 xfs_setup_inode(ip);
448 return 0;
449
450out_error_or_again:
451 if (error == EAGAIN) {
452 delay(1);
453 goto again;
454 }
455 xfs_perag_put(pag);
456 return error;
457}
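A typical caller of the relocated xfs_iget(), sketched for illustration with the surrounding error handling elided:

	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;
	/* ... read inode state under the shared ILOCK ... */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	IRELE(ip);

Note the positive errno convention (EINVAL, EAGAIN, ENOMEM) used throughout this file, and that EAGAIN never escapes: the out_error_or_again label retries internally.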
43 458
44/* 459/*
45 * The inode lookup is done in batches to keep the amount of lock traffic and 460 * The inode lookup is done in batches to keep the amount of lock traffic and
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
101 struct xfs_mount *mp, 516 struct xfs_mount *mp,
102 struct xfs_perag *pag, 517 struct xfs_perag *pag,
103 int (*execute)(struct xfs_inode *ip, 518 int (*execute)(struct xfs_inode *ip,
104 struct xfs_perag *pag, int flags), 519 struct xfs_perag *pag, int flags,
105 int flags) 520 void *args),
521 int flags,
522 void *args,
523 int tag)
106{ 524{
107 uint32_t first_index; 525 uint32_t first_index;
108 int last_error = 0; 526 int last_error = 0;
@@ -121,9 +539,17 @@ restart:
121 int i; 539 int i;
122 540
123 rcu_read_lock(); 541 rcu_read_lock();
124 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 542
543 if (tag == -1)
544 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
125 (void **)batch, first_index, 545 (void **)batch, first_index,
126 XFS_LOOKUP_BATCH); 546 XFS_LOOKUP_BATCH);
547 else
548 nr_found = radix_tree_gang_lookup_tag(
549 &pag->pag_ici_root,
550 (void **) batch, first_index,
551 XFS_LOOKUP_BATCH, tag);
552
127 if (!nr_found) { 553 if (!nr_found) {
128 rcu_read_unlock(); 554 rcu_read_unlock();
129 break; 555 break;
@@ -164,7 +590,7 @@ restart:
164 for (i = 0; i < nr_found; i++) { 590 for (i = 0; i < nr_found; i++) {
165 if (!batch[i]) 591 if (!batch[i])
166 continue; 592 continue;
167 error = execute(batch[i], pag, flags); 593 error = execute(batch[i], pag, flags, args);
168 IRELE(batch[i]); 594 IRELE(batch[i]);
169 if (error == EAGAIN) { 595 if (error == EAGAIN) {
170 skipped++; 596 skipped++;
@@ -189,12 +615,40 @@ restart:
189 return last_error; 615 return last_error;
190} 616}
191 617
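With the walker now taking a void *args cookie and an optional radix tree tag, an execute callback has this shape; my_count_inode below is a hypothetical example, not part of the patch:

STATIC int
my_count_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags,
	void			*args)
{
	atomic_inc((atomic_t *)args);
	return 0;
}

	/* untagged walk over every cached inode */
	atomic_t	count = ATOMIC_INIT(0);

	error = xfs_inode_ag_iterator(mp, my_count_inode, 0, &count);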
618/*
619 * Background scanning to trim post-EOF preallocated space. This is queued
620 * based on the 'background_prealloc_discard_period' tunable (5m by default).
621 */
622STATIC void
623xfs_queue_eofblocks(
624 struct xfs_mount *mp)
625{
626 rcu_read_lock();
627 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
628 queue_delayed_work(mp->m_eofblocks_workqueue,
629 &mp->m_eofblocks_work,
630 msecs_to_jiffies(xfs_eofb_secs * 1000));
631 rcu_read_unlock();
632}
633
634void
635xfs_eofblocks_worker(
636 struct work_struct *work)
637{
638 struct xfs_mount *mp = container_of(to_delayed_work(work),
639 struct xfs_mount, m_eofblocks_work);
640 xfs_icache_free_eofblocks(mp, NULL);
641 xfs_queue_eofblocks(mp);
642}
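xfs_eofblocks_worker() is a self-rearming delayed work item: each pass re-queues itself, and xfs_queue_eofblocks() only does so while the perag tree still carries the EOFBLOCKS tag, so the work idles away when there is nothing to scan. The general shape, as a hypothetical standalone sketch:

static struct delayed_work my_scan_work;

static void my_do_scan(void);		/* hypothetical scan pass */
static bool my_more_work(void);		/* hypothetical re-arm check */

static void my_scan_worker(struct work_struct *work)
{
	my_do_scan();
	if (my_more_work())
		schedule_delayed_work(&my_scan_work,
				      msecs_to_jiffies(5 * 60 * 1000));
}

	/* setup */
	INIT_DELAYED_WORK(&my_scan_work, my_scan_worker);
	schedule_delayed_work(&my_scan_work, 0);

	/* teardown; also stops the self re-queue from racing */
	cancel_delayed_work_sync(&my_scan_work);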
643
192int 644int
193xfs_inode_ag_iterator( 645xfs_inode_ag_iterator(
194 struct xfs_mount *mp, 646 struct xfs_mount *mp,
195 int (*execute)(struct xfs_inode *ip, 647 int (*execute)(struct xfs_inode *ip,
196 struct xfs_perag *pag, int flags), 648 struct xfs_perag *pag, int flags,
197 int flags) 649 void *args),
650 int flags,
651 void *args)
198{ 652{
199 struct xfs_perag *pag; 653 struct xfs_perag *pag;
200 int error = 0; 654 int error = 0;
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
204 ag = 0; 658 ag = 0;
205 while ((pag = xfs_perag_get(mp, ag))) { 659 while ((pag = xfs_perag_get(mp, ag))) {
206 ag = pag->pag_agno + 1; 660 ag = pag->pag_agno + 1;
207 error = xfs_inode_ag_walk(mp, pag, execute, flags); 661 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
208 xfs_perag_put(pag); 662 xfs_perag_put(pag);
209 if (error) { 663 if (error) {
210 last_error = error; 664 last_error = error;
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator(
215 return XFS_ERROR(last_error); 669 return XFS_ERROR(last_error);
216} 670}
217 671
218STATIC int
219xfs_sync_inode_data(
220 struct xfs_inode *ip,
221 struct xfs_perag *pag,
222 int flags)
223{
224 struct inode *inode = VFS_I(ip);
225 struct address_space *mapping = inode->i_mapping;
226 int error = 0;
227
228 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
229 return 0;
230
231 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
232 if (flags & SYNC_TRYLOCK)
233 return 0;
234 xfs_ilock(ip, XFS_IOLOCK_SHARED);
235 }
236
237 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
238 0 : XBF_ASYNC, FI_NONE);
239 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
240 return error;
241}
242
243/*
244 * Write out pagecache data for the whole filesystem.
245 */
246STATIC int
247xfs_sync_data(
248 struct xfs_mount *mp,
249 int flags)
250{
251 int error;
252
253 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
254
255 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
256 if (error)
257 return XFS_ERROR(error);
258
259 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
260 return 0;
261}
262
263STATIC int
264xfs_sync_fsdata(
265 struct xfs_mount *mp)
266{
267 struct xfs_buf *bp;
268 int error;
269
270 /*
271 * If the buffer is pinned then push on the log so we won't get stuck
272 * waiting in the write for someone, maybe ourselves, to flush the log.
273 *
274 * Even though we just pushed the log above, we did not have the
275 * superblock buffer locked at that point so it can become pinned in
276 * between there and here.
277 */
278 bp = xfs_getsb(mp, 0);
279 if (xfs_buf_ispinned(bp))
280 xfs_log_force(mp, 0);
281 error = xfs_bwrite(bp);
282 xfs_buf_relse(bp);
283 return error;
284}
285
286/*
287 * When remounting a filesystem read-only or freezing the filesystem, we have
288 * two phases to execute. This first phase is syncing the data before we
289 * quiesce the filesystem, and the second is flushing all the inodes out after
290 * we've waited for all the transactions created by the first phase to
291 * complete. The second phase ensures that the inodes are written to their
292 * location on disk rather than just existing in transactions in the log. This
293 * means after a quiesce there is no log replay required to write the inodes to
294 * disk (this is the main difference between a sync and a quiesce).
295 */
296/*
297 * First stage of freeze - no writers will make progress now we are here,
298 * so we flush delwri and delalloc buffers here, then wait for all I/O to
299 * complete. Data is frozen at that point. Metadata is not frozen,
300 * transactions can still occur here so don't bother emptying the AIL
301 * because it'll just get dirty again.
302 */
303int 672int
304xfs_quiesce_data( 673xfs_inode_ag_iterator_tag(
305 struct xfs_mount *mp) 674 struct xfs_mount *mp,
306{ 675 int (*execute)(struct xfs_inode *ip,
307 int error, error2 = 0; 676 struct xfs_perag *pag, int flags,
308 677 void *args),
309 /* force out the log */ 678 int flags,
310 xfs_log_force(mp, XFS_LOG_SYNC); 679 void *args,
311 680 int tag)
312 /* write superblock and hoover up shutdown errors */
313 error = xfs_sync_fsdata(mp);
314
315 /* mark the log as covered if needed */
316 if (xfs_log_need_covered(mp))
317 error2 = xfs_fs_log_dummy(mp);
318
319 return error ? error : error2;
320}
321
322/*
323 * Second stage of a quiesce. The data is already synced, now we have to take
324 * care of the metadata. New transactions are already blocked, so we need to
325 * wait for any remaining transactions to drain out before proceeding.
326 */
327void
328xfs_quiesce_attr(
329 struct xfs_mount *mp)
330{
331 int error = 0;
332
333 /* wait for all modifications to complete */
334 while (atomic_read(&mp->m_active_trans) > 0)
335 delay(100);
336
337 /* reclaim inodes to do any IO before the freeze completes */
338 xfs_reclaim_inodes(mp, 0);
339 xfs_reclaim_inodes(mp, SYNC_WAIT);
340
341 /* flush all pending changes from the AIL */
342 xfs_ail_push_all_sync(mp->m_ail);
343
344 /*
345 * Just warn here till VFS can correctly support
346 * read-only remount without racing.
347 */
348 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
349
350 /* Push the superblock and write an unmount record */
351 error = xfs_log_sbcount(mp);
352 if (error)
353 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
354 "Frozen image may not be consistent.");
355 xfs_log_unmount_write(mp);
356
357 /*
358 * At this point we might have modified the superblock again and thus
359 * added an item to the AIL, so flush it again.
360 */
361 xfs_ail_push_all_sync(mp->m_ail);
362
363 /*
364 * The superblock buffer is uncached and xfsaild_push() will lock and
365 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
366 * here but a lock on the superblock buffer will block until iodone()
367 * has completed.
368 */
369 xfs_buf_lock(mp->m_sb_bp);
370 xfs_buf_unlock(mp->m_sb_bp);
371}
372
373static void
374xfs_syncd_queue_sync(
375 struct xfs_mount *mp)
376{
377 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
378 msecs_to_jiffies(xfs_syncd_centisecs * 10));
379}
380
381/*
382 * Every sync period we need to unpin all items, reclaim inodes and sync
383 * disk quotas. We might need to cover the log to indicate that the
384 * filesystem is idle and not frozen.
385 */
386STATIC void
387xfs_sync_worker(
388 struct work_struct *work)
389{ 681{
390 struct xfs_mount *mp = container_of(to_delayed_work(work), 682 struct xfs_perag *pag;
391 struct xfs_mount, m_sync_work); 683 int error = 0;
392 int error; 684 int last_error = 0;
393 685 xfs_agnumber_t ag;
394 /*
395 * We shouldn't write/force the log if we are in the mount/unmount
396 * process or on a read only filesystem. The workqueue still needs to be
397 * active in both cases, however, because it is used for inode reclaim
398 * during these times. Use the MS_ACTIVE flag to avoid doing anything
399 * during mount. Doing work during unmount is avoided by calling
400 * cancel_delayed_work_sync on this work queue before tearing down
401 * the ail and the log in xfs_log_unmount.
402 */
403 if (!(mp->m_super->s_flags & MS_ACTIVE) &&
404 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
405 /* dgc: errors ignored here */
406 if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
407 xfs_log_need_covered(mp))
408 error = xfs_fs_log_dummy(mp);
409 else
410 xfs_log_force(mp, 0);
411 686
412 /* start pushing all the metadata that is currently 687 ag = 0;
413 * dirty */ 688 while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
414 xfs_ail_push_all(mp->m_ail); 689 ag = pag->pag_agno + 1;
690 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
691 xfs_perag_put(pag);
692 if (error) {
693 last_error = error;
694 if (error == EFSCORRUPTED)
695 break;
696 }
415 } 697 }
416 698 return XFS_ERROR(last_error);
417 /* queue us up again */
418 xfs_syncd_queue_sync(mp);
419} 699}
420 700
421/* 701/*
422 * Queue a new inode reclaim pass if there are reclaimable inodes and there 702 * Queue a new inode reclaim pass if there are reclaimable inodes and there
423 * isn't a reclaim pass already in progress. By default it runs every 5s based 703 * isn't a reclaim pass already in progress. By default it runs every 5s based
424 * on the xfs syncd work default of 30s. Perhaps this should have its own 704 * on the xfs periodic sync default of 30s. Perhaps this should have its own
425 * tunable, but that can be done if this method proves to be ineffective or too 705 * tunable, but that can be done if this method proves to be ineffective or too
426 * aggressive. 706 * aggressive.
427 */ 707 */
428static void 708static void
429xfs_syncd_queue_reclaim( 709xfs_reclaim_work_queue(
430 struct xfs_mount *mp) 710 struct xfs_mount *mp)
431{ 711{
432 712
433 rcu_read_lock(); 713 rcu_read_lock();
434 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 714 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
435 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 715 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
436 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 716 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
437 } 717 }
438 rcu_read_unlock(); 718 rcu_read_unlock();
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim(
445 * goes low. It scans as quickly as possible avoiding locked inodes or those 725 * goes low. It scans as quickly as possible avoiding locked inodes or those
446 * already being flushed, and once done schedules a future pass. 726 * already being flushed, and once done schedules a future pass.
447 */ 727 */
448STATIC void 728void
449xfs_reclaim_worker( 729xfs_reclaim_worker(
450 struct work_struct *work) 730 struct work_struct *work)
451{ 731{
@@ -453,65 +733,10 @@ xfs_reclaim_worker(
453 struct xfs_mount, m_reclaim_work); 733 struct xfs_mount, m_reclaim_work);
454 734
455 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 735 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
456 xfs_syncd_queue_reclaim(mp); 736 xfs_reclaim_work_queue(mp);
457} 737}
458 738
459/* 739static void
460 * Flush delayed allocate data, attempting to free up reserved space
461 * from existing allocations. At this point a new allocation attempt
462 * has failed with ENOSPC and we are in the process of scratching our
463 * heads, looking about for more room.
464 *
465 * Queue a new data flush if there isn't one already in progress and
466 * wait for completion of the flush. This means that we only ever have one
467 * inode flush in progress no matter how many ENOSPC events are occurring and
468 * so will prevent the system from bogging down due to every concurrent
469 * ENOSPC event scanning all the active inodes in the system for writeback.
470 */
471void
472xfs_flush_inodes(
473 struct xfs_inode *ip)
474{
475 struct xfs_mount *mp = ip->i_mount;
476
477 queue_work(xfs_syncd_wq, &mp->m_flush_work);
478 flush_work(&mp->m_flush_work);
479}
480
481STATIC void
482xfs_flush_worker(
483 struct work_struct *work)
484{
485 struct xfs_mount *mp = container_of(work,
486 struct xfs_mount, m_flush_work);
487
488 xfs_sync_data(mp, SYNC_TRYLOCK);
489 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
490}
491
492int
493xfs_syncd_init(
494 struct xfs_mount *mp)
495{
496 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
497 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
498 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
499
500 xfs_syncd_queue_sync(mp);
501
502 return 0;
503}
504
505void
506xfs_syncd_stop(
507 struct xfs_mount *mp)
508{
509 cancel_delayed_work_sync(&mp->m_sync_work);
510 cancel_delayed_work_sync(&mp->m_reclaim_work);
511 cancel_work_sync(&mp->m_flush_work);
512}
513
514void
515__xfs_inode_set_reclaim_tag( 740__xfs_inode_set_reclaim_tag(
516 struct xfs_perag *pag, 741 struct xfs_perag *pag,
517 struct xfs_inode *ip) 742 struct xfs_inode *ip)
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
529 spin_unlock(&ip->i_mount->m_perag_lock); 754 spin_unlock(&ip->i_mount->m_perag_lock);
530 755
531 /* schedule periodic background inode reclaim */ 756 /* schedule periodic background inode reclaim */
532 xfs_syncd_queue_reclaim(ip->i_mount); 757 xfs_reclaim_work_queue(ip->i_mount);
533 758
534 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 759 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
535 -1, _RET_IP_); 760 -1, _RET_IP_);
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim(
577 } 802 }
578} 803}
579 804
580void 805STATIC void
581__xfs_inode_clear_reclaim_tag( 806__xfs_inode_clear_reclaim_tag(
582 xfs_mount_t *mp, 807 xfs_mount_t *mp,
583 xfs_perag_t *pag, 808 xfs_perag_t *pag,
@@ -787,9 +1012,9 @@ out:
787 /* 1012 /*
788 * We could return EAGAIN here to make reclaim rescan the inode tree in 1013 * We could return EAGAIN here to make reclaim rescan the inode tree in
789 * a short while. However, this just burns CPU time scanning the tree 1014 * a short while. However, this just burns CPU time scanning the tree
790 * waiting for IO to complete and xfssyncd never goes back to the idle 1015 * waiting for IO to complete and the reclaim work never goes back to
791 * state. Instead, return 0 to let the next scheduled background reclaim 1016 * the idle state. Instead, return 0 to let the next scheduled
792 * attempt to reclaim the inode again. 1017 * background reclaim attempt to reclaim the inode again.
793 */ 1018 */
794 return 0; 1019 return 0;
795} 1020}
@@ -800,7 +1025,7 @@ out:
800 * then a shut down during filesystem unmount reclaim walk leak all the 1025 * then a shut down during filesystem unmount reclaim walk leak all the
801 * unreclaimed inodes. 1026 * unreclaimed inodes.
802 */ 1027 */
803int 1028STATIC int
804xfs_reclaim_inodes_ag( 1029xfs_reclaim_inodes_ag(
805 struct xfs_mount *mp, 1030 struct xfs_mount *mp,
806 int flags, 1031 int flags,
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr(
945 int nr_to_scan) 1170 int nr_to_scan)
946{ 1171{
947 /* kick background reclaimer and push the AIL */ 1172 /* kick background reclaimer and push the AIL */
948 xfs_syncd_queue_reclaim(mp); 1173 xfs_reclaim_work_queue(mp);
949 xfs_ail_push_all(mp->m_ail); 1174 xfs_ail_push_all(mp->m_ail);
950 1175
951 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1176 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count(
971 return reclaimable; 1196 return reclaimable;
972} 1197}
973 1198
1199STATIC int
1200xfs_inode_match_id(
1201 struct xfs_inode *ip,
1202 struct xfs_eofblocks *eofb)
1203{
1204 if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
1205 ip->i_d.di_uid != eofb->eof_uid)
1206 return 0;
1207
1208 if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
1209 ip->i_d.di_gid != eofb->eof_gid)
1210 return 0;
1211
1212 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
1213 xfs_get_projid(ip) != eofb->eof_prid)
1214 return 0;
1215
1216 return 1;
1217}
1218
1219STATIC int
1220xfs_inode_free_eofblocks(
1221 struct xfs_inode *ip,
1222 struct xfs_perag *pag,
1223 int flags,
1224 void *args)
1225{
1226 int ret;
1227 struct xfs_eofblocks *eofb = args;
1228
1229 if (!xfs_can_free_eofblocks(ip, false)) {
1230 /* inode could be preallocated or append-only */
1231 trace_xfs_inode_free_eofblocks_invalid(ip);
1232 xfs_inode_clear_eofblocks_tag(ip);
1233 return 0;
1234 }
1235
1236 /*
1237 * If the mapping is dirty the operation can block and wait for some
1238 * time. Unless we are waiting, skip it.
1239 */
1240 if (!(flags & SYNC_WAIT) &&
1241 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1242 return 0;
1243
1244 if (eofb) {
1245 if (!xfs_inode_match_id(ip, eofb))
1246 return 0;
1247
1248 /* skip the inode if the file size is too small */
1249 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1250 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1251 return 0;
1252 }
1253
1254 ret = xfs_free_eofblocks(ip->i_mount, ip, true);
1255
1256 /* don't revisit the inode if we're not waiting */
1257 if (ret == EAGAIN && !(flags & SYNC_WAIT))
1258 ret = 0;
1259
1260 return ret;
1261}
1262
1263int
1264xfs_icache_free_eofblocks(
1265 struct xfs_mount *mp,
1266 struct xfs_eofblocks *eofb)
1267{
1268 int flags = SYNC_TRYLOCK;
1269
1270 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1271 flags = SYNC_WAIT;
1272
1273 return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1274 eofb, XFS_ICI_EOFBLOCKS_TAG);
1275}
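As a usage sketch, a caller wanting a synchronous trim of one user's preallocations would fill in xfs_eofblocks like this (the uid variable is assumed):

	struct xfs_eofblocks	eofb = {
		.eof_flags	= XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_SYNC,
		.eof_uid	= uid,
	};

	error = xfs_icache_free_eofblocks(mp, &eofb);

XFS_EOF_FLAGS_SYNC switches the walk from SYNC_TRYLOCK to SYNC_WAIT, so inodes with dirty mappings are flushed rather than skipped.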
1276
1277void
1278xfs_inode_set_eofblocks_tag(
1279 xfs_inode_t *ip)
1280{
1281 struct xfs_mount *mp = ip->i_mount;
1282 struct xfs_perag *pag;
1283 int tagged;
1284
1285 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1286 spin_lock(&pag->pag_ici_lock);
1287 trace_xfs_inode_set_eofblocks_tag(ip);
1288
1289 tagged = radix_tree_tagged(&pag->pag_ici_root,
1290 XFS_ICI_EOFBLOCKS_TAG);
1291 radix_tree_tag_set(&pag->pag_ici_root,
1292 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1293 XFS_ICI_EOFBLOCKS_TAG);
1294 if (!tagged) {
1295 /* propagate the eofblocks tag up into the perag radix tree */
1296 spin_lock(&ip->i_mount->m_perag_lock);
1297 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1298 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1299 XFS_ICI_EOFBLOCKS_TAG);
1300 spin_unlock(&ip->i_mount->m_perag_lock);
1301
1302 /* kick off background trimming */
1303 xfs_queue_eofblocks(ip->i_mount);
1304
1305 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1306 -1, _RET_IP_);
1307 }
1308
1309 spin_unlock(&pag->pag_ici_lock);
1310 xfs_perag_put(pag);
1311}
1312
1313void
1314xfs_inode_clear_eofblocks_tag(
1315 xfs_inode_t *ip)
1316{
1317 struct xfs_mount *mp = ip->i_mount;
1318 struct xfs_perag *pag;
1319
1320 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1321 spin_lock(&pag->pag_ici_lock);
1322 trace_xfs_inode_clear_eofblocks_tag(ip);
1323
1324 radix_tree_tag_clear(&pag->pag_ici_root,
1325 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1326 XFS_ICI_EOFBLOCKS_TAG);
1327 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1328 /* clear the eofblocks tag from the perag radix tree */
1329 spin_lock(&ip->i_mount->m_perag_lock);
1330 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1331 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1332 XFS_ICI_EOFBLOCKS_TAG);
1333 spin_unlock(&ip->i_mount->m_perag_lock);
1334 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1335 -1, _RET_IP_);
1336 }
1337
1338 spin_unlock(&pag->pag_ici_lock);
1339 xfs_perag_put(pag);
1340}
1341
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6e..e0f138c70a2f 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 26
27extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp);
28 29
29int xfs_syncd_init(struct xfs_mount *mp); 30void xfs_reclaim_worker(struct work_struct *work);
30void xfs_syncd_stop(struct xfs_mount *mp);
31
32int xfs_quiesce_data(struct xfs_mount *mp);
33void xfs_quiesce_attr(struct xfs_mount *mp);
34
35void xfs_flush_inodes(struct xfs_inode *ip);
36 31
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 32int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 33int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 34void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
40 35
41void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 36void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
42void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 37
43void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 38void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
44 struct xfs_inode *ip); 39void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *);
45 42
46int xfs_sync_inode_grab(struct xfs_inode *ip); 43int xfs_sync_inode_grab(struct xfs_inode *ip);
47int xfs_inode_ag_iterator(struct xfs_mount *mp, 44int xfs_inode_ag_iterator(struct xfs_mount *mp,
48 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
49 int flags); 46 int flags, void *args),
47 int flags, void *args);
48int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
50 int flags, void *args),
51 int flags, void *args, int tag);
50 52
51#endif 53#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 784a803383ec..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,705 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_acl.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dinode.h"
32#include "xfs_inode.h"
33#include "xfs_btree.h"
34#include "xfs_ialloc.h"
35#include "xfs_quota.h"
36#include "xfs_utils.h"
37#include "xfs_trans_priv.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_trace.h"
41
42
43/*
44 * Allocate and initialise an xfs_inode.
45 */
46STATIC struct xfs_inode *
47xfs_inode_alloc(
48 struct xfs_mount *mp,
49 xfs_ino_t ino)
50{
51 struct xfs_inode *ip;
52
53 /*
54 * If this didn't occur in transaction context, we could use
55 * KM_MAYFAIL and return NULL here on ENOMEM. The code is set
56 * up to do this anyway.
57 */
58 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
59 if (!ip)
60 return NULL;
61 if (inode_init_always(mp->m_super, VFS_I(ip))) {
62 kmem_zone_free(xfs_inode_zone, ip);
63 return NULL;
64 }
65
66 ASSERT(atomic_read(&ip->i_pincount) == 0);
67 ASSERT(!spin_is_locked(&ip->i_flags_lock));
68 ASSERT(!xfs_isiflocked(ip));
69 ASSERT(ip->i_ino == 0);
70
71 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
72
73 /* initialise the xfs inode */
74 ip->i_ino = ino;
75 ip->i_mount = mp;
76 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
77 ip->i_afp = NULL;
78 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
79 ip->i_flags = 0;
80 ip->i_delayed_blks = 0;
81 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
82
83 return ip;
84}
85
86STATIC void
87xfs_inode_free_callback(
88 struct rcu_head *head)
89{
90 struct inode *inode = container_of(head, struct inode, i_rcu);
91 struct xfs_inode *ip = XFS_I(inode);
92
93 kmem_zone_free(xfs_inode_zone, ip);
94}
95
96void
97xfs_inode_free(
98 struct xfs_inode *ip)
99{
100 switch (ip->i_d.di_mode & S_IFMT) {
101 case S_IFREG:
102 case S_IFDIR:
103 case S_IFLNK:
104 xfs_idestroy_fork(ip, XFS_DATA_FORK);
105 break;
106 }
107
108 if (ip->i_afp)
109 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
110
111 if (ip->i_itemp) {
112 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
113 xfs_inode_item_destroy(ip);
114 ip->i_itemp = NULL;
115 }
116
117 /* asserts to verify all state is correct here */
118 ASSERT(atomic_read(&ip->i_pincount) == 0);
119 ASSERT(!spin_is_locked(&ip->i_flags_lock));
120 ASSERT(!xfs_isiflocked(ip));
121
122 /*
123 * Because we use RCU freeing we need to ensure the inode always
124 * appears to be reclaimed with an invalid inode number when in the
125 * free state. The ip->i_flags_lock provides the barrier against lookup
126 * races.
127 */
128 spin_lock(&ip->i_flags_lock);
129 ip->i_flags = XFS_IRECLAIM;
130 ip->i_ino = 0;
131 spin_unlock(&ip->i_flags_lock);
132
133 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
134}
135
136/*
137 * Check the validity of the inode we just found in the cache
138 */
139static int
140xfs_iget_cache_hit(
141 struct xfs_perag *pag,
142 struct xfs_inode *ip,
143 xfs_ino_t ino,
144 int flags,
145 int lock_flags) __releases(RCU)
146{
147 struct inode *inode = VFS_I(ip);
148 struct xfs_mount *mp = ip->i_mount;
149 int error;
150
151 /*
152 * check for re-use of an inode within an RCU grace period due to the
153 * radix tree nodes not being updated yet. We monitor for this by
154 * setting the inode number to zero before freeing the inode structure.
155 * If the inode has been reallocated and set up, then the inode number
156 * will not match, so check for that, too.
157 */
158 spin_lock(&ip->i_flags_lock);
159 if (ip->i_ino != ino) {
160 trace_xfs_iget_skip(ip);
161 XFS_STATS_INC(xs_ig_frecycle);
162 error = EAGAIN;
163 goto out_error;
164 }
165
166
167 /*
168 * If we are racing with another cache hit that is currently
169 * instantiating this inode or currently recycling it out of
170 * reclaimable state, wait for the initialisation to complete
171 * before continuing.
172 *
173 * XXX(hch): eventually we should do something equivalent to
174 * wait_on_inode to wait for these flags to be cleared
175 * instead of polling for it.
176 */
177 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
178 trace_xfs_iget_skip(ip);
179 XFS_STATS_INC(xs_ig_frecycle);
180 error = EAGAIN;
181 goto out_error;
182 }
183
184 /*
185 * If lookup is racing with unlink return an error immediately.
186 */
187 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
188 error = ENOENT;
189 goto out_error;
190 }
191
192 /*
193 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
194 * Need to carefully get it back into usable state.
195 */
196 if (ip->i_flags & XFS_IRECLAIMABLE) {
197 trace_xfs_iget_reclaim(ip);
198
199 /*
200 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
201 * from stomping over us while we recycle the inode. We can't
202 * clear the radix tree reclaimable tag yet as it requires
203 * pag_ici_lock to be held exclusive.
204 */
205 ip->i_flags |= XFS_IRECLAIM;
206
207 spin_unlock(&ip->i_flags_lock);
208 rcu_read_unlock();
209
210 error = -inode_init_always(mp->m_super, inode);
211 if (error) {
212 /*
213 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list.
215 */
216 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock);
218
219 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
220 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
221 trace_xfs_iget_reclaim_fail(ip);
222 goto out_error;
223 }
224
225 spin_lock(&pag->pag_ici_lock);
226 spin_lock(&ip->i_flags_lock);
227
228 /*
229 * Clear the per-lifetime state in the inode as we are now
230 * effectively a new inode and need to return to the initial
231 * state before reuse occurs.
232 */
233 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
234 ip->i_flags |= XFS_INEW;
235 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
236 inode->i_state = I_NEW;
237
238 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
239 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
240
241 spin_unlock(&ip->i_flags_lock);
242 spin_unlock(&pag->pag_ici_lock);
243 } else {
244 /* If the VFS inode is being torn down, pause and try again. */
245 if (!igrab(inode)) {
246 trace_xfs_iget_skip(ip);
247 error = EAGAIN;
248 goto out_error;
249 }
250
251 /* We've got a live one. */
252 spin_unlock(&ip->i_flags_lock);
253 rcu_read_unlock();
254 trace_xfs_iget_hit(ip);
255 }
256
257 if (lock_flags != 0)
258 xfs_ilock(ip, lock_flags);
259
260 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
261 XFS_STATS_INC(xs_ig_found);
262
263 return 0;
264
265out_error:
266 spin_unlock(&ip->i_flags_lock);
267 rcu_read_unlock();
268 return error;
269}
270
271
272static int
273xfs_iget_cache_miss(
274 struct xfs_mount *mp,
275 struct xfs_perag *pag,
276 xfs_trans_t *tp,
277 xfs_ino_t ino,
278 struct xfs_inode **ipp,
279 int flags,
280 int lock_flags)
281{
282 struct xfs_inode *ip;
283 int error;
284 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
285 int iflags;
286
287 ip = xfs_inode_alloc(mp, ino);
288 if (!ip)
289 return ENOMEM;
290
291 error = xfs_iread(mp, tp, ip, flags);
292 if (error)
293 goto out_destroy;
294
295 trace_xfs_iget_miss(ip);
296
297 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
298 error = ENOENT;
299 goto out_destroy;
300 }
301
302 /*
303 * Preload the radix tree so we can insert safely under the
304 * write spinlock. Note that we cannot sleep inside the preload
305 * region. Since we can be called from transaction context, don't
306 * recurse into the file system.
307 */
308 if (radix_tree_preload(GFP_NOFS)) {
309 error = EAGAIN;
310 goto out_destroy;
311 }
312
313 /*
314 * Because the inode hasn't been added to the radix-tree yet it can't
315 * be found by another thread, so we can do the non-sleeping lock here.
316 */
317 if (lock_flags) {
318 if (!xfs_ilock_nowait(ip, lock_flags))
319 BUG();
320 }
321
322 /*
323 * These values must be set before inserting the inode into the radix
324 * tree as the moment it is inserted a concurrent lookup (allowed by the
325 * RCU locking mechanism) can find it and that lookup must see that this
326 * is an inode currently under construction (i.e. that XFS_INEW is set).
327 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
328 * memory barrier that ensures this detection works correctly at lookup
329 * time.
330 */
331 iflags = XFS_INEW;
332 if (flags & XFS_IGET_DONTCACHE)
333 iflags |= XFS_IDONTCACHE;
334 ip->i_udquot = ip->i_gdquot = NULL;
335 xfs_iflags_set(ip, iflags);
336
337 /* insert the new inode */
338 spin_lock(&pag->pag_ici_lock);
339 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
340 if (unlikely(error)) {
341 WARN_ON(error != -EEXIST);
342 XFS_STATS_INC(xs_ig_dup);
343 error = EAGAIN;
344 goto out_preload_end;
345 }
346 spin_unlock(&pag->pag_ici_lock);
347 radix_tree_preload_end();
348
349 *ipp = ip;
350 return 0;
351
352out_preload_end:
353 spin_unlock(&pag->pag_ici_lock);
354 radix_tree_preload_end();
355 if (lock_flags)
356 xfs_iunlock(ip, lock_flags);
357out_destroy:
358 __destroy_inode(VFS_I(ip));
359 xfs_inode_free(ip);
360 return error;
361}
362
363/*
364 * Look up an inode by number in the given file system.
365 * The inode is looked up in the cache held in each AG.
366 * If the inode is found in the cache, initialise the vfs inode
367 * if necessary.
368 *
369 * If it is not in core, read it in from the file system's device,
370 * add it to the cache and initialise the vfs inode.
371 *
372 * The inode is locked according to the value of the lock_flags parameter.
373 * This flag parameter indicates how and if the inode's IO lock and inode lock
374 * should be taken.
375 *
376 * mp -- the mount point structure for the current file system. It points
377 * to the inode hash table.
378 * tp -- a pointer to the current transaction if there is one. This is
379 * simply passed through to the xfs_iread() call.
380 * ino -- the number of the inode desired. This is the unique identifier
381 * within the file system for the inode being requested.
382 * lock_flags -- flags indicating how to lock the inode. See the comment
383 * for xfs_ilock() for a list of valid values.
384 */
385int
386xfs_iget(
387 xfs_mount_t *mp,
388 xfs_trans_t *tp,
389 xfs_ino_t ino,
390 uint flags,
391 uint lock_flags,
392 xfs_inode_t **ipp)
393{
394 xfs_inode_t *ip;
395 int error;
396 xfs_perag_t *pag;
397 xfs_agino_t agino;
398
399 /*
400 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
401 * doesn't get freed while it's being referenced during a
402 * radix tree traversal here. It assumes this function
403 * acquires only the ILOCK (and therefore it has no need to
404 * involve the IOLOCK in this synchronization).
405 */
406 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
407
408 /* reject inode numbers outside existing AGs */
409 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
410 return EINVAL;
411
412 /* get the perag structure and ensure that it's inode capable */
413 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
414 agino = XFS_INO_TO_AGINO(mp, ino);
415
416again:
417 error = 0;
418 rcu_read_lock();
419 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
420
421 if (ip) {
422 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
423 if (error)
424 goto out_error_or_again;
425 } else {
426 rcu_read_unlock();
427 XFS_STATS_INC(xs_ig_missed);
428
429 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
430 flags, lock_flags);
431 if (error)
432 goto out_error_or_again;
433 }
434 xfs_perag_put(pag);
435
436 *ipp = ip;
437
438 /*
439 * If we have a real type for an on-disk inode, we can set ops(&unlock)
440 * now. If it's a new inode being created, xfs_ialloc will handle it.
441 */
442 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
443 xfs_setup_inode(ip);
444 return 0;
445
446out_error_or_again:
447 if (error == EAGAIN) {
448 delay(1);
449 goto again;
450 }
451 xfs_perag_put(pag);
452 return error;
453}
454
455/*
456 * This is a wrapper routine around the xfs_ilock() routine
457 * used to centralize some grungy code. It is used in places
458 * that wish to lock the inode solely for reading the extents.
459 * The reason these places can't just call xfs_ilock(SHARED)
460 * is that the inode lock also guards the bringing in of the
461 * extents from disk for a file in b-tree format. If the inode
462 * is in b-tree format, then we need to lock the inode exclusively
463 * until the extents are read in. Locking it exclusively all
464 * the time would limit our parallelism unnecessarily, though.
465 * What we do instead is check to see if the extents have been
466 * read in yet, and only lock the inode exclusively if they
467 * have not.
468 *
469 * The function returns a value which should be given to the
470 * corresponding xfs_iunlock_map_shared(). This value is
471 * the mode in which the lock was actually taken.
472 */
473uint
474xfs_ilock_map_shared(
475 xfs_inode_t *ip)
476{
477 uint lock_mode;
478
479 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
480 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
481 lock_mode = XFS_ILOCK_EXCL;
482 } else {
483 lock_mode = XFS_ILOCK_SHARED;
484 }
485
486 xfs_ilock(ip, lock_mode);
487
488 return lock_mode;
489}
490
491/*
492 * This is simply the unlock routine to go with xfs_ilock_map_shared().
493 * All it does is call xfs_iunlock() with the given lock_mode.
494 */
495void
496xfs_iunlock_map_shared(
497 xfs_inode_t *ip,
498 unsigned int lock_mode)
499{
500 xfs_iunlock(ip, lock_mode);
501}
502
503/*
504 * The xfs inode contains 2 locks: a multi-reader lock called the
505 * i_iolock and a multi-reader lock called the i_lock. This routine
506 * allows either or both of the locks to be obtained.
507 *
508 * The 2 locks should always be ordered so that the IO lock is
509 * obtained first in order to prevent deadlock.
510 *
511 * ip -- the inode being locked
512 * lock_flags -- this parameter indicates the inode's locks
513 * to be locked. It can be:
514 * XFS_IOLOCK_SHARED,
515 * XFS_IOLOCK_EXCL,
516 * XFS_ILOCK_SHARED,
517 * XFS_ILOCK_EXCL,
518 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
519 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
520 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
521 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
522 */
523void
524xfs_ilock(
525 xfs_inode_t *ip,
526 uint lock_flags)
527{
528 /*
529 * You can't set both SHARED and EXCL for the same lock,
530 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
531 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
532 */
533 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
534 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
535 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
536 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
537 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
538
539 if (lock_flags & XFS_IOLOCK_EXCL)
540 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
541 else if (lock_flags & XFS_IOLOCK_SHARED)
542 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
543
544 if (lock_flags & XFS_ILOCK_EXCL)
545 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
546 else if (lock_flags & XFS_ILOCK_SHARED)
547 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
548
549 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
550}
551
552/*
553 * This is just like xfs_ilock(), except that the caller
554 * is guaranteed not to sleep. It returns 1 if it gets
555 * the requested locks and 0 otherwise. If the IO lock is
556 * obtained but the inode lock cannot be, then the IO lock
557 * is dropped before returning.
558 *
559 * ip -- the inode being locked
560 * lock_flags -- this parameter indicates the inode's locks
561 * to be locked. See the comment for xfs_ilock() for a list
562 * of valid values.
563 */
564int
565xfs_ilock_nowait(
566 xfs_inode_t *ip,
567 uint lock_flags)
568{
569 /*
570 * You can't set both SHARED and EXCL for the same lock,
571 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
572 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
573 */
574 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
575 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
576 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
577 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
578 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
579
580 if (lock_flags & XFS_IOLOCK_EXCL) {
581 if (!mrtryupdate(&ip->i_iolock))
582 goto out;
583 } else if (lock_flags & XFS_IOLOCK_SHARED) {
584 if (!mrtryaccess(&ip->i_iolock))
585 goto out;
586 }
587 if (lock_flags & XFS_ILOCK_EXCL) {
588 if (!mrtryupdate(&ip->i_lock))
589 goto out_undo_iolock;
590 } else if (lock_flags & XFS_ILOCK_SHARED) {
591 if (!mrtryaccess(&ip->i_lock))
592 goto out_undo_iolock;
593 }
594 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
595 return 1;
596
597 out_undo_iolock:
598 if (lock_flags & XFS_IOLOCK_EXCL)
599 mrunlock_excl(&ip->i_iolock);
600 else if (lock_flags & XFS_IOLOCK_SHARED)
601 mrunlock_shared(&ip->i_iolock);
602 out:
603 return 0;
604}
605
606/*
607 * xfs_iunlock() is used to drop the inode locks acquired with
608 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
609 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
610 * that we know which locks to drop.
611 *
612 * ip -- the inode being unlocked
613 * lock_flags -- this parameter indicates the inode's locks
614 * to be unlocked. See the comment for xfs_ilock() for a list
615 * of valid values for this parameter.
616 *
617 */
618void
619xfs_iunlock(
620 xfs_inode_t *ip,
621 uint lock_flags)
622{
623 /*
624 * You can't set both SHARED and EXCL for the same lock,
625 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
626 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
627 */
628 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
629 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
630 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
631 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
632 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
633 ASSERT(lock_flags != 0);
634
635 if (lock_flags & XFS_IOLOCK_EXCL)
636 mrunlock_excl(&ip->i_iolock);
637 else if (lock_flags & XFS_IOLOCK_SHARED)
638 mrunlock_shared(&ip->i_iolock);
639
640 if (lock_flags & XFS_ILOCK_EXCL)
641 mrunlock_excl(&ip->i_lock);
642 else if (lock_flags & XFS_ILOCK_SHARED)
643 mrunlock_shared(&ip->i_lock);
644
645 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
646}
647
648/*
649 * give up write locks. the i/o lock cannot be held nested
650 * if it is being demoted.
651 */
652void
653xfs_ilock_demote(
654 xfs_inode_t *ip,
655 uint lock_flags)
656{
657 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
658 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
659
660 if (lock_flags & XFS_ILOCK_EXCL)
661 mrdemote(&ip->i_lock);
662 if (lock_flags & XFS_IOLOCK_EXCL)
663 mrdemote(&ip->i_iolock);
664
665 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
666}
667
668#ifdef DEBUG
669int
670xfs_isilocked(
671 xfs_inode_t *ip,
672 uint lock_flags)
673{
674 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
675 if (!(lock_flags & XFS_ILOCK_SHARED))
676 return !!ip->i_lock.mr_writer;
677 return rwsem_is_locked(&ip->i_lock.mr_lock);
678 }
679
680 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
681 if (!(lock_flags & XFS_IOLOCK_SHARED))
682 return !!ip->i_iolock.mr_writer;
683 return rwsem_is_locked(&ip->i_iolock.mr_lock);
684 }
685
686 ASSERT(0);
687 return 0;
688}
689#endif
690
691void
692__xfs_iflock(
693 struct xfs_inode *ip)
694{
695 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
696 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
697
698 do {
699 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
700 if (xfs_isiflocked(ip))
701 io_schedule();
702 } while (!xfs_iflock_nowait(ip));
703
704 finish_wait(wq, &wait.wait);
705}
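__xfs_iflock(), shown here in its old location, is the standard open-coded wait-on-bit loop: prepare an exclusive waiter, sleep while the bit is set, and loop until the trylock succeeds. Generalised as a hedged sketch with hypothetical names:

static void my_lock_bit(unsigned long *word, int bit)
{
	wait_queue_head_t *wq = bit_waitqueue(word, bit);
	DEFINE_WAIT_BIT(wait, word, bit);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait,
					  TASK_UNINTERRUPTIBLE);
		if (test_bit(bit, word))
			io_schedule();
	} while (test_and_set_bit(bit, word));	/* old value 1: still locked */
	finish_wait(wq, &wait.wait);
}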
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258fcfa2..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48#include "xfs_icache.h"
48 49
49kmem_zone_t *xfs_ifork_zone; 50kmem_zone_t *xfs_ifork_zone;
50kmem_zone_t *xfs_inode_zone; 51kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
74 return 0; 75 return 0;
75} 76}
76 77
78/*
79 * This is a wrapper routine around the xfs_ilock() routine used to centralize
80 * some grungy code. It is used in places that wish to lock the inode solely
81 * for reading the extents. The reason these places can't just call
82 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
83 * extents from disk for a file in b-tree format. If the inode is in b-tree
84 * format, then we need to lock the inode exclusively until the extents are read
85 * in. Locking it exclusively all the time would limit our parallelism
86 * unnecessarily, though. What we do instead is check to see if the extents
87 * have been read in yet, and only lock the inode exclusively if they have not.
88 *
89 * The function returns a value which should be given to the corresponding
90 * xfs_iunlock_map_shared(). This value is the mode in which the lock was
91 * actually taken.
92 */
93uint
94xfs_ilock_map_shared(
95 xfs_inode_t *ip)
96{
97 uint lock_mode;
98
99 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
100 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
101 lock_mode = XFS_ILOCK_EXCL;
102 } else {
103 lock_mode = XFS_ILOCK_SHARED;
104 }
105
106 xfs_ilock(ip, lock_mode);
107
108 return lock_mode;
109}
110
111/*
112 * This is simply the unlock routine to go with xfs_ilock_map_shared().
113 * All it does is call xfs_iunlock() with the given lock_mode.
114 */
115void
116xfs_iunlock_map_shared(
117 xfs_inode_t *ip,
118 unsigned int lock_mode)
119{
120 xfs_iunlock(ip, lock_mode);
121}
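Callers must treat these two as a pair, feeding the returned mode back to the unlock; a minimal usage sketch:

	uint	lock_mode;

	lock_mode = xfs_ilock_map_shared(ip);
	/* ... walk the in-core extent list ... */
	xfs_iunlock_map_shared(ip, lock_mode);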
122
123/*
124 * The xfs inode contains 2 locks: a multi-reader lock called the
125 * i_iolock and a multi-reader lock called the i_lock. This routine
126 * allows either or both of the locks to be obtained.
127 *
128 * The 2 locks should always be ordered so that the IO lock is
129 * obtained first in order to prevent deadlock.
130 *
131 * ip -- the inode being locked
132 * lock_flags -- this parameter indicates the inode's locks
133 * to be locked. It can be:
134 * XFS_IOLOCK_SHARED,
135 * XFS_IOLOCK_EXCL,
136 * XFS_ILOCK_SHARED,
137 * XFS_ILOCK_EXCL,
138 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
139 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
140 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
141 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
142 */
143void
144xfs_ilock(
145 xfs_inode_t *ip,
146 uint lock_flags)
147{
148 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
149
150 /*
151 * You can't set both SHARED and EXCL for the same lock,
152 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
153 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
154 */
155 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
156 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
157 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
158 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
159 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
160
161 if (lock_flags & XFS_IOLOCK_EXCL)
162 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
163 else if (lock_flags & XFS_IOLOCK_SHARED)
164 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
165
166 if (lock_flags & XFS_ILOCK_EXCL)
167 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
168 else if (lock_flags & XFS_ILOCK_SHARED)
169 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
170}
171
172/*
173 * This is just like xfs_ilock(), except that the caller
174 * is guaranteed not to sleep. It returns 1 if it gets
175 * the requested locks and 0 otherwise. If the IO lock is
176 * obtained but the inode lock cannot be, then the IO lock
177 * is dropped before returning.
178 *
179 * ip -- the inode being locked
180 * lock_flags -- this parameter indicates the inode's locks to be
181 * locked.  See the comment for xfs_ilock() for a list
182 * of valid values.
183 */
184int
185xfs_ilock_nowait(
186 xfs_inode_t *ip,
187 uint lock_flags)
188{
189 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
190
191 /*
192 * You can't set both SHARED and EXCL for the same lock,
193 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
194 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
195 */
196 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
197 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
198 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
199 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
200 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
201
202 if (lock_flags & XFS_IOLOCK_EXCL) {
203 if (!mrtryupdate(&ip->i_iolock))
204 goto out;
205 } else if (lock_flags & XFS_IOLOCK_SHARED) {
206 if (!mrtryaccess(&ip->i_iolock))
207 goto out;
208 }
209 if (lock_flags & XFS_ILOCK_EXCL) {
210 if (!mrtryupdate(&ip->i_lock))
211 goto out_undo_iolock;
212 } else if (lock_flags & XFS_ILOCK_SHARED) {
213 if (!mrtryaccess(&ip->i_lock))
214 goto out_undo_iolock;
215 }
216 return 1;
217
218 out_undo_iolock:
219 if (lock_flags & XFS_IOLOCK_EXCL)
220 mrunlock_excl(&ip->i_iolock);
221 else if (lock_flags & XFS_IOLOCK_SHARED)
222 mrunlock_shared(&ip->i_iolock);
223 out:
224 return 0;
225}
226
227/*
228 * xfs_iunlock() is used to drop the inode locks acquired with
229 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
230 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
231 * that we know which locks to drop.
232 *
233 * ip -- the inode being unlocked
234 * lock_flags -- this parameter indicates the inode's locks to be
235 * unlocked.  See the comment for xfs_ilock() for a list
236 * of valid values for this parameter.
237 *
238 */
239void
240xfs_iunlock(
241 xfs_inode_t *ip,
242 uint lock_flags)
243{
244 /*
245 * You can't set both SHARED and EXCL for the same lock,
246 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
247 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
248 */
249 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
250 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
251 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
252 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
253 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
254 ASSERT(lock_flags != 0);
255
256 if (lock_flags & XFS_IOLOCK_EXCL)
257 mrunlock_excl(&ip->i_iolock);
258 else if (lock_flags & XFS_IOLOCK_SHARED)
259 mrunlock_shared(&ip->i_iolock);
260
261 if (lock_flags & XFS_ILOCK_EXCL)
262 mrunlock_excl(&ip->i_lock);
263 else if (lock_flags & XFS_ILOCK_SHARED)
264 mrunlock_shared(&ip->i_lock);
265
266 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
267}
268
269/*
270 * give up write locks. the i/o lock cannot be held nested
271 * if it is being demoted.
272 */
273void
274xfs_ilock_demote(
275 xfs_inode_t *ip,
276 uint lock_flags)
277{
278 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
279 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
280
281 if (lock_flags & XFS_ILOCK_EXCL)
282 mrdemote(&ip->i_lock);
283 if (lock_flags & XFS_IOLOCK_EXCL)
284 mrdemote(&ip->i_iolock);
285
286 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
287}
288
289#ifdef DEBUG
290int
291xfs_isilocked(
292 xfs_inode_t *ip,
293 uint lock_flags)
294{
295 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
296 if (!(lock_flags & XFS_ILOCK_SHARED))
297 return !!ip->i_lock.mr_writer;
298 return rwsem_is_locked(&ip->i_lock.mr_lock);
299 }
300
301 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
302 if (!(lock_flags & XFS_IOLOCK_SHARED))
303 return !!ip->i_iolock.mr_writer;
304 return rwsem_is_locked(&ip->i_iolock.mr_lock);
305 }
306
307 ASSERT(0);
308 return 0;
309}
310#endif
311
312void
313__xfs_iflock(
314 struct xfs_inode *ip)
315{
316 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
317 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
318
319 do {
320 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
321 if (xfs_isiflocked(ip))
322 io_schedule();
323 } while (!xfs_iflock_nowait(ip));
324
325 finish_wait(wq, &wait.wait);
326}
327
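__xfs_iflock() is the slow path behind the inode flush lock; the fast paths it spins on are atomic bit operations on the same i_flags word. Roughly (a simplified paraphrase of the xfs_inode.h helpers, not the verbatim definitions):

	static inline int xfs_iflock_nowait(struct xfs_inode *ip)
	{
		/* true if we atomically set the flush-lock bit ourselves */
		return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
	}

	static inline int xfs_isiflocked(struct xfs_inode *ip)
	{
		return xfs_iflags_test(ip, XFS_IFLOCK);
	}
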
77#ifdef DEBUG 328#ifdef DEBUG
78/* 329/*
79 * Make sure that the extents in the given memory buffer 330 * Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
131} 382}
132#endif 383#endif
133 384
385static void
386xfs_inode_buf_verify(
387 struct xfs_buf *bp)
388{
389 struct xfs_mount *mp = bp->b_target->bt_mount;
390 int i;
391 int ni;
392
393 /*
394 * Validate the magic number and version of every inode in the buffer
395 */
396 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
397 for (i = 0; i < ni; i++) {
398 int di_ok;
399 xfs_dinode_t *dip;
400
401 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
402 (i << mp->m_sb.sb_inodelog));
403 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
404 XFS_DINODE_GOOD_VERSION(dip->di_version);
405 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
406 XFS_ERRTAG_ITOBP_INOTOBP,
407 XFS_RANDOM_ITOBP_INOTOBP))) {
408 xfs_buf_ioerror(bp, EFSCORRUPTED);
409 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
410 mp, dip);
411#ifdef DEBUG
412 xfs_emerg(mp,
413 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
414 (unsigned long long)bp->b_bn, i,
415 be16_to_cpu(dip->di_magic));
416 ASSERT(0);
417#endif
418 }
419 }
420 xfs_inobp_check(mp, bp);
421}
422
423
424static void
425xfs_inode_buf_read_verify(
426 struct xfs_buf *bp)
427{
428 xfs_inode_buf_verify(bp);
429}
430
431static void
432xfs_inode_buf_write_verify(
433 struct xfs_buf *bp)
434{
435 xfs_inode_buf_verify(bp);
436}
437
438const struct xfs_buf_ops xfs_inode_buf_ops = {
439 .verify_read = xfs_inode_buf_read_verify,
440 .verify_write = xfs_inode_buf_write_verify,
441};
442
443
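With the ops table in place, readers attach it at buffer read time and writers inherit it from the buffer, so every inode buffer is validated on both sides of the IO. The read side looks like this (mirroring the xfs_imap_to_bp() hunk that follows):

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags | XBF_UNMAPPED,
				   &bp, &xfs_inode_buf_ops);
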
134/* 444/*
135 * This routine is called to map an inode to the buffer containing the on-disk 445 * This routine is called to map an inode to the buffer containing the on-disk
136 * version of the inode. It returns a pointer to the buffer containing the 446 * version of the inode. It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
145 struct xfs_mount *mp, 455 struct xfs_mount *mp,
146 struct xfs_trans *tp, 456 struct xfs_trans *tp,
147 struct xfs_imap *imap, 457 struct xfs_imap *imap,
148 struct xfs_dinode **dipp, 458 struct xfs_dinode **dipp,
149 struct xfs_buf **bpp, 459 struct xfs_buf **bpp,
150 uint buf_flags, 460 uint buf_flags,
151 uint iget_flags) 461 uint iget_flags)
152{ 462{
153 struct xfs_buf *bp; 463 struct xfs_buf *bp;
154 int error; 464 int error;
155 int i;
156 int ni;
157 465
158 buf_flags |= XBF_UNMAPPED; 466 buf_flags |= XBF_UNMAPPED;
159 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 467 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
160 (int)imap->im_len, buf_flags, &bp); 468 (int)imap->im_len, buf_flags, &bp,
469 &xfs_inode_buf_ops);
161 if (error) { 470 if (error) {
162 if (error != EAGAIN) { 471 if (error == EAGAIN) {
163 xfs_warn(mp,
164 "%s: xfs_trans_read_buf() returned error %d.",
165 __func__, error);
166 } else {
167 ASSERT(buf_flags & XBF_TRYLOCK); 472 ASSERT(buf_flags & XBF_TRYLOCK);
473 return error;
168 } 474 }
169 return error;
170 }
171
172 /*
173 * Validate the magic number and version of every inode in the buffer
174 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
175 */
176#ifdef DEBUG
177 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
178#else /* usual case */
179 ni = 1;
180#endif
181 475
182 for (i = 0; i < ni; i++) { 476 if (error == EFSCORRUPTED &&
183 int di_ok; 477 (iget_flags & XFS_IGET_UNTRUSTED))
184 xfs_dinode_t *dip; 478 return XFS_ERROR(EINVAL);
185 479
186 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 480 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 (i << mp->m_sb.sb_inodelog)); 481 __func__, error);
188 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 482 return error;
189 XFS_DINODE_GOOD_VERSION(dip->di_version);
190 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
191 XFS_ERRTAG_ITOBP_INOTOBP,
192 XFS_RANDOM_ITOBP_INOTOBP))) {
193 if (iget_flags & XFS_IGET_UNTRUSTED) {
194 xfs_trans_brelse(tp, bp);
195 return XFS_ERROR(EINVAL);
196 }
197 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
198 mp, dip);
199#ifdef DEBUG
200 xfs_emerg(mp,
201 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
202 (unsigned long long)imap->im_blkno, i,
203 be16_to_cpu(dip->di_magic));
204 ASSERT(0);
205#endif
206 xfs_trans_brelse(tp, bp);
207 return XFS_ERROR(EFSCORRUPTED);
208 }
209 } 483 }
210 484
211 xfs_inobp_check(mp, bp);
212
213 *bpp = bp; 485 *bpp = bp;
214 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); 486 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
215 return 0; 487 return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
853 * set according to the contents of the given cred structure. 1125 * set according to the contents of the given cred structure.
854 * 1126 *
855 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1127 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
856 * has a free inode available, call xfs_iget() 1128 * has a free inode available, call xfs_iget() to obtain the in-core
857 * to obtain the in-core version of the allocated inode. Finally, 1129 * version of the allocated inode. Finally, fill in the inode and
858 * fill in the inode and log its initial contents. In this case, 1130 * log its initial contents. In this case, ialloc_context would be
859 * ialloc_context would be set to NULL and call_again set to false. 1131 * set to NULL.
860 * 1132 *
861 * If xfs_dialloc() does not have an available inode, 1133 * If xfs_dialloc() does not have an available inode, it will replenish
862 * it will replenish its supply by doing an allocation. Since we can 1134 * its supply by doing an allocation. Since we can only do one
863 * only do one allocation within a transaction without deadlocks, we 1135 * allocation within a transaction without deadlocks, we must commit
864 * must commit the current transaction before returning the inode itself. 1136 * the current transaction before returning the inode itself.
865 * In this case, therefore, we will set call_again to true and return. 1137 * In this case, therefore, we will set ialloc_context and return.
866 * The caller should then commit the current transaction, start a new 1138 * The caller should then commit the current transaction, start a new
867 * transaction, and call xfs_ialloc() again to actually get the inode. 1139 * transaction, and call xfs_ialloc() again to actually get the inode.
868 * 1140 *
@@ -1509,10 +1781,23 @@ xfs_ifree_cluster(
1509 * to mark all the active inodes on the buffer stale. 1781 * to mark all the active inodes on the buffer stale.
1510 */ 1782 */
1511 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1783 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1512 mp->m_bsize * blks_per_cluster, 0); 1784 mp->m_bsize * blks_per_cluster,
1785 XBF_UNMAPPED);
1513 1786
1514 if (!bp) 1787 if (!bp)
1515 return ENOMEM; 1788 return ENOMEM;
1789
1790 /*
1791 * This buffer may not have been correctly initialised as we
1792 * didn't read it from disk. That's not important because we are
1793 * only using it to mark the buffer as stale in the log, and to
1794 * attach stale cached inodes to it. That means it will never be
1795 * dispatched for IO. If it is, we want to know about it, and we
1796 * want it to fail. We can achieve this by adding a write
1797 * verifier to the buffer.
1798 */
1799 bp->b_ops = &xfs_inode_buf_ops;
1800
1516 /* 1801 /*
1517 * Walk the inodes already attached to the buffer and mark them 1802 * Walk the inodes already attached to the buffer and mark them
1518 * stale. These will all have the flush locks held, so an 1803 * stale. These will all have the flush locks held, so an
@@ -3660,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
3660 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 3945 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3661 } 3946 }
3662} 3947}
3948
3949/*
3950 * Test whether it is appropriate to check an inode for, and free, post-EOF
3951 * blocks. The 'force' parameter determines whether we should also consider
3952 * regular files that are marked preallocated or append-only.
3953 */
3954bool
3955xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3956{
3957 /* prealloc/delalloc exists only on regular files */
3958 if (!S_ISREG(ip->i_d.di_mode))
3959 return false;
3960
3961 /*
3962 * Zero sized files with no cached pages and no delalloc blocks will not
3963 * have speculative prealloc/delalloc blocks to remove.
3964 */
3965 if (VFS_I(ip)->i_size == 0 &&
3966 VN_CACHED(VFS_I(ip)) == 0 &&
3967 ip->i_delayed_blks == 0)
3968 return false;
3969
3970 /* If we haven't read in the extent list, then don't do it now. */
3971 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3972 return false;
3973
3974 /*
3975 * Do not free real preallocated or append-only files unless the file
3976 * has delalloc blocks and we are forced to remove them.
3977 */
3978 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3979 if (!force || ip->i_delayed_blks == 0)
3980 return false;
3981
3982 return true;
3983}
3984
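A hedged sketch of the expected call pattern (xfs_free_eofblocks() is the companion routine elsewhere in this series; its exact signature is assumed here):

	/*
	 * force == false: leave preallocated/append-only files alone unless
	 * the caller explicitly asks to strip their delalloc blocks.
	 */
	if (xfs_can_free_eofblocks(ip, false))
		error = xfs_free_eofblocks(mp, ip, true);	/* true: take the iolock */
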
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
497 ((pip)->i_d.di_mode & S_ISGID)) 497 ((pip)->i_d.di_mode & S_ISGID))
498 498
499
499/* 500/*
500 * xfs_iget.c prototypes. 501 * xfs_inode.c prototypes.
501 */ 502 */
502int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
503 uint, uint, xfs_inode_t **);
504void xfs_ilock(xfs_inode_t *, uint); 503void xfs_ilock(xfs_inode_t *, uint);
505int xfs_ilock_nowait(xfs_inode_t *, uint); 504int xfs_ilock_nowait(xfs_inode_t *, uint);
506void xfs_iunlock(xfs_inode_t *, uint); 505void xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
508int xfs_isilocked(xfs_inode_t *, uint); 507int xfs_isilocked(xfs_inode_t *, uint);
509uint xfs_ilock_map_shared(xfs_inode_t *); 508uint xfs_ilock_map_shared(xfs_inode_t *);
510void xfs_iunlock_map_shared(xfs_inode_t *, uint); 509void xfs_iunlock_map_shared(xfs_inode_t *, uint);
511void xfs_inode_free(struct xfs_inode *ip);
512
513/*
514 * xfs_inode.c prototypes.
515 */
516int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 510int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
517 xfs_nlink_t, xfs_dev_t, prid_t, int, 511 xfs_nlink_t, xfs_dev_t, prid_t, int,
518 struct xfs_buf **, xfs_inode_t **); 512 struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
591void xfs_iext_irec_compact_pages(xfs_ifork_t *); 585void xfs_iext_irec_compact_pages(xfs_ifork_t *);
592void xfs_iext_irec_compact_full(xfs_ifork_t *); 586void xfs_iext_irec_compact_full(xfs_ifork_t *);
593void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); 587void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
588bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
594 589
595#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 590#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
596 591
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
603extern struct kmem_zone *xfs_ifork_zone; 598extern struct kmem_zone *xfs_ifork_zone;
604extern struct kmem_zone *xfs_inode_zone; 599extern struct kmem_zone *xfs_inode_zone;
605extern struct kmem_zone *xfs_ili_zone; 600extern struct kmem_zone *xfs_ili_zone;
601extern const struct xfs_buf_ops xfs_inode_buf_ops;
606 602
607#endif /* __XFS_INODE_H__ */ 603#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8305f2ac6773..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
42#include "xfs_inode_item.h" 42#include "xfs_inode_item.h"
43#include "xfs_export.h" 43#include "xfs_export.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46#include <linux/capability.h> 47#include <linux/capability.h>
47#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -70,7 +71,7 @@ xfs_find_handle(
70 int hsize; 71 int hsize;
71 xfs_handle_t handle; 72 xfs_handle_t handle;
72 struct inode *inode; 73 struct inode *inode;
73 struct fd f; 74 struct fd f = {0};
74 struct path path; 75 struct path path;
75 int error; 76 int error;
76 struct xfs_inode *ip; 77 struct xfs_inode *ip;
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
1602 error = xfs_errortag_clearall(mp, 1); 1603 error = xfs_errortag_clearall(mp, 1);
1603 return -error; 1604 return -error;
1604 1605
1606 case XFS_IOC_FREE_EOFBLOCKS: {
1607 struct xfs_eofblocks eofb;
1608
1609 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1610 return -XFS_ERROR(EFAULT);
1611
1612 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
1613 return -XFS_ERROR(EINVAL);
1614
1615 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
1616 return -XFS_ERROR(EINVAL);
1617
1618 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
1619 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
1620 return -XFS_ERROR(EINVAL);
1621
1622 error = xfs_icache_free_eofblocks(mp, &eofb);
1623 return -error;
1624 }
1625
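Driven from userspace, the new ioctl would look roughly like this (a sketch: the struct layout and XFS_EOFBLOCKS_VERSION come from xfs_fs.h in this series, and the pad fields must be zero per the validation above):

	struct xfs_eofblocks eofb;

	memset(&eofb, 0, sizeof(eofb));		/* pad32/pad64 must stay zero */
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = 0;			/* no filters: scan all inodes */

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)	/* fd: any fd on the fs */
		perror("XFS_IOC_FREE_EOFBLOCKS");
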
1605 default: 1626 default:
1606 return -ENOTTY; 1627 return -ENOTTY;
1607 } 1628 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..add06b4e9a63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
41#include "xfs_utils.h" 41#include "xfs_utils.h"
42#include "xfs_iomap.h" 42#include "xfs_iomap.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44#include "xfs_icache.h"
44 45
45 46
46#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 47#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -373,7 +374,7 @@ xfs_iomap_write_delay(
373 xfs_extlen_t extsz; 374 xfs_extlen_t extsz;
374 int nimaps; 375 int nimaps;
375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 376 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376 int prealloc, flushed = 0; 377 int prealloc;
377 int error; 378 int error;
378 379
379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 380 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +435,29 @@ retry:
434 } 435 }
435 436
436 /* 437 /*
437 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For 438 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
438 * ENOSPC, * flush all other inodes with delalloc blocks to free up
439 * some of the excess reserved metadata space. For both cases, retry
440 * without EOF preallocation. 439 * without EOF preallocation.
441 */ 440 */
442 if (nimaps == 0) { 441 if (nimaps == 0) {
443 trace_xfs_delalloc_enospc(ip, offset, count); 442 trace_xfs_delalloc_enospc(ip, offset, count);
444 if (flushed) 443 if (prealloc) {
445 return XFS_ERROR(error ? error : ENOSPC); 444 prealloc = 0;
446 445 error = 0;
447 if (error == ENOSPC) { 446 goto retry;
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 xfs_flush_inodes(ip);
450 xfs_ilock(ip, XFS_ILOCK_EXCL);
451 } 447 }
452 448 return XFS_ERROR(error ? error : ENOSPC);
453 flushed = 1;
454 error = 0;
455 prealloc = 0;
456 goto retry;
457 } 449 }
458 450
459 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 451 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
460 return xfs_alert_fsblock_zero(ip, &imap[0]); 452 return xfs_alert_fsblock_zero(ip, &imap[0]);
461 453
454 /*
455 * Tag the inode as speculatively preallocated so we can reclaim this
456 * space on demand, if necessary.
457 */
458 if (prealloc)
459 xfs_inode_set_eofblocks_tag(ip);
460
462 *ret_imap = imap[0]; 461 *ret_imap = imap[0];
463 return 0; 462 return 0;
464} 463}
@@ -584,7 +583,9 @@ xfs_iomap_write_allocate(
584 * pointer that the caller gave to us. 583 * pointer that the caller gave to us.
585 */ 584 */
586 error = xfs_bmapi_write(tp, ip, map_start_fsb, 585 error = xfs_bmapi_write(tp, ip, map_start_fsb,
587 count_fsb, 0, &first_block, 1, 586 count_fsb,
587 XFS_BMAPI_STACK_SWITCH,
588 &first_block, 1,
588 imap, &nimaps, &free_list); 589 imap, &nimaps, &free_list);
589 if (error) 590 if (error)
590 goto trans_cancel; 591 goto trans_cancel;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
38#include "xfs_vnodeops.h" 38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
779 * care about here. 780 * care about here.
780 */ 781 */
781 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 782 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
782 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, 783 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
783 FI_NONE); 784 ip->i_d.di_size, newsize);
784 if (error) 785 if (error)
785 goto out_unlock; 786 goto out_unlock;
786 } 787 }
@@ -854,6 +855,9 @@ xfs_setattr_size(
854 * and do not wait the usual (long) time for writeout. 855 * and do not wait the usual (long) time for writeout.
855 */ 856 */
856 xfs_iflags_set(ip, XFS_ITRUNCATED); 857 xfs_iflags_set(ip, XFS_ITRUNCATED);
858
859 /* A truncate down always removes post-EOF blocks. */
860 xfs_inode_clear_eofblocks_tag(ip);
857 } 861 }
858 862
859 if (mask & ATTR_CTIME) { 863 if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
34#include "xfs_error.h" 34#include "xfs_error.h"
35#include "xfs_btree.h" 35#include "xfs_btree.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_icache.h"
37 38
38STATIC int 39STATIC int
39xfs_internal_inum( 40xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
395 if (xfs_inobt_maskn(chunkidx, nicluster) 396 if (xfs_inobt_maskn(chunkidx, nicluster)
396 & ~r.ir_free) 397 & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 398 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster); 399 agbno, nbcluster,
400 &xfs_inode_buf_ops);
399 } 401 }
400 irbp->ir_startino = r.ir_startino; 402 irbp->ir_startino = r.ir_startino;
401 irbp->ir_freecount = r.ir_freecount; 403 irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
44#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/crc32c.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/mutex.h> 49#include <linux/mutex.h>
49#include <linux/file.h> 50#include <linux/file.h>
@@ -118,6 +119,7 @@
118#define xfs_rotorstep xfs_params.rotorstep.val 119#define xfs_rotorstep xfs_params.rotorstep.val
119#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 120#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
120#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val 121#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
122#define xfs_eofb_secs xfs_params.eofb_timer.val
121 123
122#define current_cpu() (raw_smp_processor_id()) 124#define current_cpu() (raw_smp_processor_id())
123#define current_pid() (current->pid) 125#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f9370d0e7..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
34#include "xfs_dinode.h" 34#include "xfs_dinode.h"
35#include "xfs_inode.h" 35#include "xfs_inode.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
38#include "xfs_cksum.h"
37 39
38kmem_zone_t *xfs_log_ticket_zone; 40kmem_zone_t *xfs_log_ticket_zone;
39 41
@@ -458,7 +460,8 @@ xfs_log_reserve(
458 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
459 *ticp = tic; 461 *ticp = tic;
460 462
461 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); 463 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
464 : tic->t_unit_res);
462 465
463 trace_xfs_log_reserve(log, tic); 466 trace_xfs_log_reserve(log, tic);
464 467
@@ -679,25 +682,29 @@ out:
679} 682}
680 683
681/* 684/*
682 * Finish the recovery of the file system. This is separate from 685 * Finish the recovery of the file system. This is separate from the
683 * the xfs_log_mount() call, because it depends on the code in 686 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
684 * xfs_mountfs() to read in the root and real-time bitmap inodes 687 * in the root and real-time bitmap inodes between calling xfs_log_mount() and
685 * between calling xfs_log_mount() and here. 688 * here.
686 * 689 *
687 * mp - ubiquitous xfs mount point structure 690 * If we finish recovery successfully, start the background log work. If we are
691 * not doing recovery, then we have a RO filesystem and we don't need to start
692 * it.
688 */ 693 */
689int 694int
690xfs_log_mount_finish(xfs_mount_t *mp) 695xfs_log_mount_finish(xfs_mount_t *mp)
691{ 696{
692 int error; 697 int error = 0;
693 698
694 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 699 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
695 error = xlog_recover_finish(mp->m_log); 700 error = xlog_recover_finish(mp->m_log);
696 else { 701 if (!error)
697 error = 0; 702 xfs_log_work_queue(mp);
703 } else {
698 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 704 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
699 } 705 }
700 706
707
701 return error; 708 return error;
702} 709}
703 710
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
850} /* xfs_log_unmount_write */ 857} /* xfs_log_unmount_write */
851 858
852/* 859/*
853 * Deallocate log structures for unmount/relocation. 860 * Empty the log for unmount/freeze.
861 *
862 * To do this, we first need to shut down the background log work so it is not
863 * trying to cover the log as we clean up. We then need to unpin all objects in
864 * the log so we can then flush them out. Once they have completed their IO and
865 * run the callbacks removing themselves from the AIL, we can write the unmount
866 * record.
867 */
868void
869xfs_log_quiesce(
870 struct xfs_mount *mp)
871{
872 cancel_delayed_work_sync(&mp->m_log->l_work);
873 xfs_log_force(mp, XFS_LOG_SYNC);
874
875 /*
876 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
877 * will push it, xfs_wait_buftarg() will not wait for it. Further,
878 * xfs_buf_iowait() cannot be used because it was pushed with the
879 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
880 * the IO to complete.
881 */
882 xfs_ail_push_all_sync(mp->m_ail);
883 xfs_wait_buftarg(mp->m_ddev_targp);
884 xfs_buf_lock(mp->m_sb_bp);
885 xfs_buf_unlock(mp->m_sb_bp);
886
887 xfs_log_unmount_write(mp);
888}
889
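xfs_log_unmount() below is one consumer of xfs_log_quiesce(); the freeze path (xfs_quiesce_attr() in this series) is the other expected caller, which is why the unmount-record write is factored out here rather than left inline in the unmount path.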
890/*
891 * Shut down and release the AIL and Log.
854 * 892 *
855 * We need to stop the aild from running before we destroy 893 * During unmount, we need to ensure we flush all the dirty metadata objects
856 * and deallocate the log as the aild references the log. 894 * from the AIL so that the log is empty before we write the unmount record to
895 * the log. Once this is done, we can tear down the AIL and the log.
857 */ 896 */
858void 897void
859xfs_log_unmount(xfs_mount_t *mp) 898xfs_log_unmount(
899 struct xfs_mount *mp)
860{ 900{
861 cancel_delayed_work_sync(&mp->m_sync_work); 901 xfs_log_quiesce(mp);
902
862 xfs_trans_ail_destroy(mp); 903 xfs_trans_ail_destroy(mp);
863 xlog_dealloc_log(mp->m_log); 904 xlog_dealloc_log(mp->m_log);
864} 905}
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
1090 * with it being freed after writing the unmount record to the 1131 * with it being freed after writing the unmount record to the
1091 * log. 1132 * log.
1092 */ 1133 */
1093 1134}
1094} /* xlog_iodone */
1095 1135
1096/* 1136/*
1097 * Return size of each in-core log record buffer. 1137 * Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
1161} /* xlog_get_iclog_buffer_size */ 1201} /* xlog_get_iclog_buffer_size */
1162 1202
1163 1203
1204void
1205xfs_log_work_queue(
1206 struct xfs_mount *mp)
1207{
1208 queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
1209 msecs_to_jiffies(xfs_syncd_centisecs * 10));
1210}
1211
1212/*
1213 * Every sync period we need to unpin all items in the AIL and push them to
1214 * disk. If there is nothing dirty, then we might need to cover the log to
1215 * indicate that the filesystem is idle.
1216 */
1217void
1218xfs_log_worker(
1219 struct work_struct *work)
1220{
1221 struct xlog *log = container_of(to_delayed_work(work),
1222 struct xlog, l_work);
1223 struct xfs_mount *mp = log->l_mp;
1224
1225 /* dgc: errors ignored - not fatal and nowhere to report them */
1226 if (xfs_log_need_covered(mp))
1227 xfs_fs_log_dummy(mp);
1228 else
1229 xfs_log_force(mp, 0);
1230
1231 /* start pushing all the metadata that is currently dirty */
1232 xfs_ail_push_all(mp->m_ail);
1233
1234 /* queue us up again */
1235 xfs_log_work_queue(mp);
1236}
1237
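As a worked example of the rearm interval: xfs_syncd_centisecs defaults to 3000 (centiseconds), so xfs_log_work_queue() requeues the worker every 3000 * 10 ms = 30 seconds, matching the old xfssyncd cadence.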
1164/* 1238/*
1165 * This routine initializes some of the log structure for a given mount point. 1239 * This routine initializes some of the log structure for a given mount point.
1166 * Its primary purpose is to fill in enough, so recovery can occur. However, 1240 * Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
1195 log->l_logBBsize = num_bblks; 1269 log->l_logBBsize = num_bblks;
1196 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1270 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1197 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1271 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1272 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1198 1273
1199 log->l_prev_block = -1; 1274 log->l_prev_block = -1;
1200 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1275 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail(
1417} 1492}
1418 1493
1419/* 1494/*
1495 * Stamp cycle number in every block
1496 */
1497STATIC void
1498xlog_pack_data(
1499 struct xlog *log,
1500 struct xlog_in_core *iclog,
1501 int roundoff)
1502{
1503 int i, j, k;
1504 int size = iclog->ic_offset + roundoff;
1505 __be32 cycle_lsn;
1506 xfs_caddr_t dp;
1507
1508 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1509
1510 dp = iclog->ic_datap;
1511 for (i = 0; i < BTOBB(size); i++) {
1512 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1513 break;
1514 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
1515 *(__be32 *)dp = cycle_lsn;
1516 dp += BBSIZE;
1517 }
1518
1519 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1520 xlog_in_core_2_t *xhdr = iclog->ic_data;
1521
1522 for ( ; i < BTOBB(size); i++) {
1523 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1524 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1525 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
1526 *(__be32 *)dp = cycle_lsn;
1527 dp += BBSIZE;
1528 }
1529
1530 for (i = 1; i < log->l_iclog_heads; i++)
1531 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1532 }
1533}
1534
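The stamping scheme above: the first __be32 of every 512-byte basic block in the record is overwritten with the record's cycle number, so recovery can locate the log head by scanning for a cycle discontinuity. The displaced words are saved in h_cycle_data[] (or in the extended headers for v2 logs) and restored by xlog_unpack_data() at recovery time.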
1535/*
1536 * Calculate the checksum for a log buffer.
1537 *
1538 * This is a little more complicated than it should be because the various
1539 * headers and the actual data are non-contiguous.
1540 */
1541__le32
1542xlog_cksum(
1543 struct xlog *log,
1544 struct xlog_rec_header *rhead,
1545 char *dp,
1546 int size)
1547{
1548 __uint32_t crc;
1549
1550 /* first generate the crc for the record header ... */
1551 crc = xfs_start_cksum((char *)rhead,
1552 sizeof(struct xlog_rec_header),
1553 offsetof(struct xlog_rec_header, h_crc));
1554
1555 /* ... then for additional cycle data for v2 logs ... */
1556 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1557 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
1558 int i;
1559
1560 for (i = 1; i < log->l_iclog_heads; i++) {
1561 crc = crc32c(crc, &xhdr[i].hic_xheader,
1562 sizeof(struct xlog_rec_ext_header));
1563 }
1564 }
1565
1566 /* ... and finally for the payload */
1567 crc = crc32c(crc, dp, size);
1568
1569 return xfs_end_cksum(crc);
1570}
1571
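The xfs_start_cksum()/xfs_end_cksum() helpers used above come from the new xfs_cksum.h added in this series. In rough paraphrase (hedged: not the verbatim header, but the intended shape -- CRC the region with the CRC field itself treated as zero, then fold to on-disk form):

	__uint32_t zero = 0, crc;

	crc = crc32c(XFS_CRC_SEED, buffer, crc_offset);	    /* bytes before h_crc */
	crc = crc32c(crc, &zero, sizeof(zero));		    /* h_crc counted as 0 */
	crc = crc32c(crc, buffer + crc_offset + sizeof(zero),
		     length - crc_offset - sizeof(zero));   /* rest of the buffer */
	return xfs_end_cksum(crc);			    /* fold to __le32 */
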
1572/*
1420 * The bdstrat callback function for log bufs. This gives us a central 1573 * The bdstrat callback function for log bufs. This gives us a central
1421 * place to trap bufs in case we get hit by a log I/O error and need to 1574 * place to trap bufs in case we get hit by a log I/O error and need to
1422 * shutdown. Actually, in practice, even when we didn't get a log error, 1575 * shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1476,7 +1629,6 @@ xlog_sync(
1476 struct xlog *log, 1629 struct xlog *log,
1477 struct xlog_in_core *iclog) 1630 struct xlog_in_core *iclog)
1478{ 1631{
1479 xfs_caddr_t dptr; /* pointer to byte sized element */
1480 xfs_buf_t *bp; 1632 xfs_buf_t *bp;
1481 int i; 1633 int i;
1482 uint count; /* byte count of bwrite */ 1634 uint count; /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
1485 int split = 0; /* split write into two regions */ 1637 int split = 0; /* split write into two regions */
1486 int error; 1638 int error;
1487 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); 1639 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1640 int size;
1488 1641
1489 XFS_STATS_INC(xs_log_writes); 1642 XFS_STATS_INC(xs_log_writes);
1490 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 1643 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
1515 xlog_pack_data(log, iclog, roundoff); 1668 xlog_pack_data(log, iclog, roundoff);
1516 1669
1517 /* real byte length */ 1670 /* real byte length */
1518 if (v2) { 1671 size = iclog->ic_offset;
1519 iclog->ic_header.h_len = 1672 if (v2)
1520 cpu_to_be32(iclog->ic_offset + roundoff); 1673 size += roundoff;
1521 } else { 1674 iclog->ic_header.h_len = cpu_to_be32(size);
1522 iclog->ic_header.h_len =
1523 cpu_to_be32(iclog->ic_offset);
1524 }
1525 1675
1526 bp = iclog->ic_bp; 1676 bp = iclog->ic_bp;
1527 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); 1677 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
1530 1680
1531 /* Do we need to split this write into 2 parts? */ 1681 /* Do we need to split this write into 2 parts? */
1532 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { 1682 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1683 char *dptr;
1684
1533 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); 1685 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1534 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); 1686 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1535 iclog->ic_bwritecnt = 2; /* split into 2 writes */ 1687 iclog->ic_bwritecnt = 2;
1688
1689 /*
1690 * Bump the cycle numbers at the start of each block in the
1691 * part of the iclog that ends up in the buffer that gets
1692 * written to the start of the log.
1693 *
1694 * Watch out for the header magic number case, though.
1695 */
1696 dptr = (char *)&iclog->ic_header + count;
1697 for (i = 0; i < split; i += BBSIZE) {
1698 __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
1699 if (++cycle == XLOG_HEADER_MAGIC_NUM)
1700 cycle++;
1701 *(__be32 *)dptr = cpu_to_be32(cycle);
1702
1703 dptr += BBSIZE;
1704 }
1536 } else { 1705 } else {
1537 iclog->ic_bwritecnt = 1; 1706 iclog->ic_bwritecnt = 1;
1538 } 1707 }
1708
1709 /* calculate the checksum */
1710 iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
1711 iclog->ic_datap, size);
1712
1539 bp->b_io_length = BTOBB(count); 1713 bp->b_io_length = BTOBB(count);
1540 bp->b_fspriv = iclog; 1714 bp->b_fspriv = iclog;
1541 XFS_BUF_ZEROFLAGS(bp); 1715 XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
1589 bp->b_flags |= XBF_SYNCIO; 1763 bp->b_flags |= XBF_SYNCIO;
1590 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1764 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1591 bp->b_flags |= XBF_FUA; 1765 bp->b_flags |= XBF_FUA;
1592 dptr = bp->b_addr;
1593 /*
1594 * Bump the cycle numbers at the start of each block
1595 * since this part of the buffer is at the start of
1596 * a new cycle. Watch out for the header magic number
1597 * case, though.
1598 */
1599 for (i = 0; i < split; i += BBSIZE) {
1600 be32_add_cpu((__be32 *)dptr, 1);
1601 if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
1602 be32_add_cpu((__be32 *)dptr, 1);
1603 dptr += BBSIZE;
1604 }
1605 1766
1606 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1767 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1607 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1768 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
1618 return 0; 1779 return 0;
1619} /* xlog_sync */ 1780} /* xlog_sync */
1620 1781
1621
1622/* 1782/*
1623 * Deallocate a log structure 1783 * Deallocate a log structure
1624 */ 1784 */
@@ -2387,14 +2547,27 @@ xlog_state_do_callback(
2387 2547
2388 2548
2389 /* 2549 /*
2390 * update the last_sync_lsn before we drop the 2550 * Completion of an iclog IO does not imply that
2551 * a transaction has completed, as transactions
2552 * can be large enough to span many iclogs. We
2553 * cannot change the tail of the log half way
2554 * through a transaction as this may be the only
2555 * transaction in the log and moving the tail to
2556 * point to the middle of it will prevent
2557 * recovery from finding the start of the
2558 * transaction. Hence we should only update the
2559 * last_sync_lsn if this iclog contains
2560 * transaction completion callbacks on it.
2561 *
2562 * We have to do this before we drop the
2391 * icloglock to ensure we are the only one that 2563 * icloglock to ensure we are the only one that
2392 * can update it. 2564 * can update it.
2393 */ 2565 */
2394 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2566 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2395 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2567 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2396 atomic64_set(&log->l_last_sync_lsn, 2568 if (iclog->ic_callback)
2397 be64_to_cpu(iclog->ic_header.h_lsn)); 2569 atomic64_set(&log->l_last_sync_lsn,
2570 be64_to_cpu(iclog->ic_header.h_lsn));
2398 2571
2399 } else 2572 } else
2400 ioerrors++; 2573 ioerrors++;
@@ -3700,3 +3873,4 @@ xlog_iclogs_empty(
3700 } while (iclog != log->l_iclog); 3873 } while (iclog != log->l_iclog);
3701 return 1; 3874 return 1;
3702} 3875}
3876
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
181 xfs_lsn_t *commit_lsn, int flags); 181 xfs_lsn_t *commit_lsn, int flags);
182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
183 183
184void xfs_log_work_queue(struct xfs_mount *mp);
185void xfs_log_worker(struct work_struct *work);
186void xfs_log_quiesce(struct xfs_mount *mp);
187
184#endif 188#endif
185#endif /* __XFS_LOG_H__ */ 189#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
139/* 139/*
140 * Flags for log structure 140 * Flags for log structure
141 */ 141 */
142#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
143#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ 142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
291 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ 290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
292 __be64 h_lsn; /* lsn of this LR : 8 */ 291 __be64 h_lsn; /* lsn of this LR : 8 */
293 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ 292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
294 __be32 h_chksum; /* may not be used; non-zero if used : 4 */ 293 __le32 h_crc; /* crc of log record : 4 */
295 __be32 h_prev_block; /* block number to previous LR : 4 */ 294 __be32 h_prev_block; /* block number to previous LR : 4 */
296 __be32 h_num_logops; /* number of log operations in this LR : 4 */ 295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
297 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; 296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
495 struct xfs_buf *l_xbuf; /* extra buffer for log 494 struct xfs_buf *l_xbuf; /* extra buffer for log
496 * wrapping */ 495 * wrapping */
497 struct xfs_buftarg *l_targ; /* buftarg of log */ 496 struct xfs_buftarg *l_targ; /* buftarg of log */
497 struct delayed_work l_work; /* background flush work */
498 uint l_flags; 498 uint l_flags;
499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
500 struct list_head *l_buf_cancel_table; 500 struct list_head *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
554extern int 554extern int
555xlog_recover_finish( 555xlog_recover_finish(
556 struct xlog *log); 556 struct xlog *log);
557extern void 557
558xlog_pack_data( 558extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
559 struct xlog *log, 559 char *dp, int size);
560 struct xlog_in_core *iclog,
561 int);
562 560
563extern kmem_zone_t *xfs_log_ticket_zone; 561extern kmem_zone_t *xfs_log_ticket_zone;
564struct xlog_ticket * 562struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace352bf..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,7 +41,9 @@
41#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
42#include "xfs_quota.h" 42#include "xfs_quota.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_cksum.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
46#include "xfs_icache.h"
45 47
46STATIC int 48STATIC int
47xlog_find_zeroed( 49xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
2143 buf_flags |= XBF_UNMAPPED; 2145 buf_flags |= XBF_UNMAPPED;
2144 2146
2145 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2147 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146 buf_flags); 2148 buf_flags, NULL);
2147 if (!bp) 2149 if (!bp)
2148 return XFS_ERROR(ENOMEM); 2150 return XFS_ERROR(ENOMEM);
2149 error = bp->b_error; 2151 error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
2236 } 2238 }
2237 trace_xfs_log_recover_inode_recover(log, in_f); 2239 trace_xfs_log_recover_inode_recover(log, in_f);
2238 2240
2239 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); 2241 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2242 NULL);
2240 if (!bp) { 2243 if (!bp) {
2241 error = ENOMEM; 2244 error = ENOMEM;
2242 goto error; 2245 goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
2547 ASSERT(dq_f->qlf_len == 1); 2550 ASSERT(dq_f->qlf_len == 1);
2548 2551
2549 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2552 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); 2553 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2554 NULL);
2551 if (error) 2555 if (error)
2552 return error; 2556 return error;
2553 2557
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
3213 mp->m_dmevmask = mp_dmevmask; 3217 mp->m_dmevmask = mp_dmevmask;
3214} 3218}
3215 3219
3216
3217#ifdef DEBUG
3218STATIC void
3219xlog_pack_data_checksum(
3220 struct xlog *log,
3221 struct xlog_in_core *iclog,
3222 int size)
3223{
3224 int i;
3225 __be32 *up;
3226 uint chksum = 0;
3227
3228 up = (__be32 *)iclog->ic_datap;
3229 /* divide length by 4 to get # words */
3230 for (i = 0; i < (size >> 2); i++) {
3231 chksum ^= be32_to_cpu(*up);
3232 up++;
3233 }
3234 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3235}
3236#else
3237#define xlog_pack_data_checksum(log, iclog, size)
3238#endif
3239
3240/* 3220/*
3241 * Stamp cycle number in every block 3221 * Upack the log buffer data and crc check it. If the check fails, issue a
3222 * warning if and only if the CRC in the header is non-zero. This makes the
3223 * check an advisory warning, and the zero CRC check will prevent failure
3224 * warnings from being emitted when upgrading the kernel from one that does not
3225 * add CRCs by default.
3226 *
3227 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
3228 * corruption failure.
3242 */ 3229 */
3243void 3230STATIC int
3244xlog_pack_data( 3231xlog_unpack_data_crc(
3245 struct xlog *log, 3232 struct xlog_rec_header *rhead,
3246 struct xlog_in_core *iclog, 3233 xfs_caddr_t dp,
3247 int roundoff) 3234 struct xlog *log)
3248{ 3235{
3249 int i, j, k; 3236 __le32 crc;
3250 int size = iclog->ic_offset + roundoff; 3237
3251 __be32 cycle_lsn; 3238 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3252 xfs_caddr_t dp; 3239 if (crc != rhead->h_crc) {
3253 3240 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3254 xlog_pack_data_checksum(log, iclog, size); 3241 xfs_alert(log->l_mp,
3255 3242 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3256 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3243 le32_to_cpu(rhead->h_crc),
3257 3244 le32_to_cpu(crc));
3258 dp = iclog->ic_datap; 3245 xfs_hex_dump(dp, 32);
3259 for (i = 0; i < BTOBB(size) &&
3260 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262 *(__be32 *)dp = cycle_lsn;
3263 dp += BBSIZE;
3264 }
3265
3266 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267 xlog_in_core_2_t *xhdr = iclog->ic_data;
3268
3269 for ( ; i < BTOBB(size); i++) {
3270 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273 *(__be32 *)dp = cycle_lsn;
3274 dp += BBSIZE;
3275 } 3246 }
3276 3247
3277 for (i = 1; i < log->l_iclog_heads; i++) { 3248 /*
3278 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3249 * If we've detected a log record corruption, then we can't
3279 } 3250 * recover past this point. Abort recovery if we are enforcing
3251 * CRC protection by punting an error back up the stack.
3252 */
3253 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3254 return EFSCORRUPTED;
3280 } 3255 }
3256
3257 return 0;
3281} 3258}
3282 3259
3283STATIC void 3260STATIC int
3284xlog_unpack_data( 3261xlog_unpack_data(
3285 struct xlog_rec_header *rhead, 3262 struct xlog_rec_header *rhead,
3286 xfs_caddr_t dp, 3263 xfs_caddr_t dp,
3287 struct xlog *log) 3264 struct xlog *log)
3288{ 3265{
3289 int i, j, k; 3266 int i, j, k;
3267 int error;
3268
3269 error = xlog_unpack_data_crc(rhead, dp, log);
3270 if (error)
3271 return error;
3290 3272
3291 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3273 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3274 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
3303 dp += BBSIZE; 3285 dp += BBSIZE;
3304 } 3286 }
3305 } 3287 }
3288
3289 return 0;
3306} 3290}
3307 3291
3308STATIC int 3292STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
3434 if (error) 3418 if (error)
3435 goto bread_err2; 3419 goto bread_err2;
3436 3420
3437 xlog_unpack_data(rhead, offset, log); 3421 error = xlog_unpack_data(rhead, offset, log);
3438 if ((error = xlog_recover_process_data(log, 3422 if (error)
3439 rhash, rhead, offset, pass))) 3423 goto bread_err2;
3424
3425 error = xlog_recover_process_data(log,
3426 rhash, rhead, offset, pass);
3427 if (error)
3440 goto bread_err2; 3428 goto bread_err2;
3441 blk_no += bblks + hblks; 3429 blk_no += bblks + hblks;
3442 } 3430 }
@@ -3541,14 +3529,19 @@ xlog_do_recovery_pass(
3541 * - order is important. 3529 * - order is important.
3542 */ 3530 */
3543 error = xlog_bread_offset(log, 0, 3531 error = xlog_bread_offset(log, 0,
3544 bblks - split_bblks, hbp, 3532 bblks - split_bblks, dbp,
3545 offset + BBTOB(split_bblks)); 3533 offset + BBTOB(split_bblks));
3546 if (error) 3534 if (error)
3547 goto bread_err2; 3535 goto bread_err2;
3548 } 3536 }
3549 xlog_unpack_data(rhead, offset, log); 3537
3550 if ((error = xlog_recover_process_data(log, rhash, 3538 error = xlog_unpack_data(rhead, offset, log);
3551 rhead, offset, pass))) 3539 if (error)
3540 goto bread_err2;
3541
3542 error = xlog_recover_process_data(log, rhash,
3543 rhead, offset, pass);
3544 if (error)
3552 goto bread_err2; 3545 goto bread_err2;
3553 blk_no += bblks; 3546 blk_no += bblks;
3554 } 3547 }
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
3573 if (error) 3566 if (error)
3574 goto bread_err2; 3567 goto bread_err2;
3575 3568
3576 xlog_unpack_data(rhead, offset, log); 3569 error = xlog_unpack_data(rhead, offset, log);
3577 if ((error = xlog_recover_process_data(log, rhash, 3570 if (error)
3578 rhead, offset, pass))) 3571 goto bread_err2;
3572
3573 error = xlog_recover_process_data(log, rhash,
3574 rhead, offset, pass);
3575 if (error)
3579 goto bread_err2; 3576 goto bread_err2;
3580 blk_no += bblks + hblks; 3577 blk_no += bblks + hblks;
3581 } 3578 }
@@ -3689,13 +3686,14 @@ xlog_do_recover(
3689 3686
3690 /* 3687 /*
3691 * Now that we've finished replaying all buffer and inode 3688 * Now that we've finished replaying all buffer and inode
3692 * updates, re-read in the superblock. 3689 * updates, re-read in the superblock and reverify it.
3693 */ 3690 */
3694 bp = xfs_getsb(log->l_mp, 0); 3691 bp = xfs_getsb(log->l_mp, 0);
3695 XFS_BUF_UNDONE(bp); 3692 XFS_BUF_UNDONE(bp);
3696 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3693 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697 XFS_BUF_READ(bp); 3694 XFS_BUF_READ(bp);
3698 XFS_BUF_UNASYNC(bp); 3695 XFS_BUF_UNASYNC(bp);
3696 bp->b_ops = &xfs_sb_buf_ops;
3699 xfsbdstrat(log->l_mp, bp); 3697 xfsbdstrat(log->l_mp, bp);
3700 error = xfs_buf_iowait(bp); 3698 error = xfs_buf_iowait(bp);
3701 if (error) { 3699 if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
3707 3705
3708 /* Convert superblock from on-disk format */ 3706 /* Convert superblock from on-disk format */
3709 sbp = &log->l_mp->m_sb; 3707 sbp = &log->l_mp->m_sb;
3710 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); 3708 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3711 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3709 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712 ASSERT(xfs_sb_good_version(sbp)); 3710 ASSERT(xfs_sb_good_version(sbp));
3713 xfs_buf_relse(bp); 3711 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..da508463ff10 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_fsops.h" 42#include "xfs_fsops.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
303xfs_mount_validate_sb( 304xfs_mount_validate_sb(
304 xfs_mount_t *mp, 305 xfs_mount_t *mp,
305 xfs_sb_t *sbp, 306 xfs_sb_t *sbp,
306 int flags) 307 bool check_inprogress)
307{ 308{
308 int loud = !(flags & XFS_MFSI_QUIET);
309 309
310 /* 310 /*
311 * If the log device and data device have the 311 * If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
315 * a volume filesystem in a non-volume manner. 315 * a volume filesystem in a non-volume manner.
316 */ 316 */
317 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 317 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
318 if (loud) 318 xfs_warn(mp, "bad magic number");
319 xfs_warn(mp, "bad magic number");
320 return XFS_ERROR(EWRONGFS); 319 return XFS_ERROR(EWRONGFS);
321 } 320 }
322 321
323 if (!xfs_sb_good_version(sbp)) { 322 if (!xfs_sb_good_version(sbp)) {
324 if (loud) 323 xfs_warn(mp, "bad version");
325 xfs_warn(mp, "bad version");
326 return XFS_ERROR(EWRONGFS); 324 return XFS_ERROR(EWRONGFS);
327 } 325 }
328 326
329 if (unlikely( 327 if (unlikely(
330 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 328 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
331 if (loud) 329 xfs_warn(mp,
332 xfs_warn(mp,
333 "filesystem is marked as having an external log; " 330 "filesystem is marked as having an external log; "
334 "specify logdev on the mount command line."); 331 "specify logdev on the mount command line.");
335 return XFS_ERROR(EINVAL); 332 return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
337 334
338 if (unlikely( 335 if (unlikely(
339 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 336 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
340 if (loud) 337 xfs_warn(mp,
341 xfs_warn(mp,
342 "filesystem is marked as having an internal log; " 338 "filesystem is marked as having an internal log; "
343 "do not specify logdev on the mount command line."); 339 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 340 return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
372 sbp->sb_dblocks == 0 || 368 sbp->sb_dblocks == 0 ||
373 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || 369 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
374 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { 370 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
375 if (loud) 371 XFS_CORRUPTION_ERROR("SB sanity check failed",
376 XFS_CORRUPTION_ERROR("SB sanity check failed",
377 XFS_ERRLEVEL_LOW, mp, sbp); 372 XFS_ERRLEVEL_LOW, mp, sbp);
378 return XFS_ERROR(EFSCORRUPTED); 373 return XFS_ERROR(EFSCORRUPTED);
379 } 374 }
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
382 * Until this is fixed only page-sized or smaller data blocks work. 377 * Until this is fixed only page-sized or smaller data blocks work.
383 */ 378 */
384 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 379 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
385 if (loud) { 380 xfs_warn(mp,
386 xfs_warn(mp,
387 "File system with blocksize %d bytes. " 381 "File system with blocksize %d bytes. "
388 "Only pagesize (%ld) or less will currently work.", 382 "Only pagesize (%ld) or less will currently work.",
389 sbp->sb_blocksize, PAGE_SIZE); 383 sbp->sb_blocksize, PAGE_SIZE);
390 }
391 return XFS_ERROR(ENOSYS); 384 return XFS_ERROR(ENOSYS);
392 } 385 }
393 386
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
401 case 2048: 394 case 2048:
402 break; 395 break;
403 default: 396 default:
404 if (loud) 397 xfs_warn(mp, "inode size of %d bytes not supported",
405 xfs_warn(mp, "inode size of %d bytes not supported",
406 sbp->sb_inodesize); 398 sbp->sb_inodesize);
407 return XFS_ERROR(ENOSYS); 399 return XFS_ERROR(ENOSYS);
408 } 400 }
409 401
410 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 402 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
411 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 403 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
412 if (loud) 404 xfs_warn(mp,
413 xfs_warn(mp,
414 "file system too large to be mounted on this system."); 405 "file system too large to be mounted on this system.");
415 return XFS_ERROR(EFBIG); 406 return XFS_ERROR(EFBIG);
416 } 407 }
417 408
418 if (unlikely(sbp->sb_inprogress)) { 409 if (check_inprogress && sbp->sb_inprogress) {
419 if (loud) 410 xfs_warn(mp, "Offline file system operation in progress!");
420 xfs_warn(mp, "file system busy");
421 return XFS_ERROR(EFSCORRUPTED); 411 return XFS_ERROR(EFSCORRUPTED);
422 } 412 }
423 413
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
425 * Version 1 directory format has never worked on Linux. 415 * Version 1 directory format has never worked on Linux.
426 */ 416 */
427 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 417 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
428 if (loud) 418 xfs_warn(mp, "file system using version 1 directory format");
429 xfs_warn(mp,
430 "file system using version 1 directory format");
431 return XFS_ERROR(ENOSYS); 419 return XFS_ERROR(ENOSYS);
432 } 420 }
433 421
@@ -520,11 +508,9 @@ out_unwind:
520 508
521void 509void
522xfs_sb_from_disk( 510xfs_sb_from_disk(
523 struct xfs_mount *mp, 511 struct xfs_sb *to,
524 xfs_dsb_t *from) 512 xfs_dsb_t *from)
525{ 513{
526 struct xfs_sb *to = &mp->m_sb;
527
528 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 514 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
529 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 515 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
530 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 516 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
626 } 612 }
627} 613}
628 614
615static void
616xfs_sb_verify(
617 struct xfs_buf *bp)
618{
619 struct xfs_mount *mp = bp->b_target->bt_mount;
620 struct xfs_sb sb;
621 int error;
622
623 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
624
625 /*
626 * Only check the in progress field for the primary superblock as
627 * mkfs.xfs doesn't clear it from secondary superblocks.
628 */
629 error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
630 if (error)
631 xfs_buf_ioerror(bp, error);
632}
633
634static void
635xfs_sb_read_verify(
636 struct xfs_buf *bp)
637{
638 xfs_sb_verify(bp);
639}
640
641/*
642 * We may be probed for a filesystem match, so we may not want to emit
643 * messages when the superblock buffer is not actually an XFS superblock.
 644 * If we find an XFS superblock, then run a normal, noisy mount because we are
645 * really going to mount it and want to know about errors.
646 */
647static void
648xfs_sb_quiet_read_verify(
649 struct xfs_buf *bp)
650{
651 struct xfs_sb sb;
652
653 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
654
655 if (sb.sb_magicnum == XFS_SB_MAGIC) {
656 /* XFS filesystem, verify noisily! */
657 xfs_sb_read_verify(bp);
658 return;
659 }
660 /* quietly fail */
661 xfs_buf_ioerror(bp, EFSCORRUPTED);
662}
663
664static void
665xfs_sb_write_verify(
666 struct xfs_buf *bp)
667{
668 xfs_sb_verify(bp);
669}
670
671const struct xfs_buf_ops xfs_sb_buf_ops = {
672 .verify_read = xfs_sb_read_verify,
673 .verify_write = xfs_sb_write_verify,
674};
675
676static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
677 .verify_read = xfs_sb_quiet_read_verify,
678 .verify_write = xfs_sb_write_verify,
679};
680
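Taken together, the verifier hooks above give every buffer a per-type validation callback: verify_read runs after read I/O completes, verify_write runs before submission, and failures are signalled by stamping the buffer with xfs_buf_ioerror() rather than by a return value. A minimal sketch of the same pattern for some other metadata type might look like this (the my_* names are hypothetical, not part of this patch):

    static void
    my_read_verify(
    	struct xfs_buf	*bp)
    {
    	/* decode the on-disk magic and sanity check it */
    	if (be32_to_cpu(*(__be32 *)bp->b_addr) != MY_MAGIC)
    		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* flag it, don't return */
    }

    static void
    my_write_verify(
    	struct xfs_buf	*bp)
    {
    	/* the same checks on the way out catch in-memory corruption */
    	my_read_verify(bp);
    }

    const struct xfs_buf_ops my_buf_ops = {
    	.verify_read	= my_read_verify,
    	.verify_write	= my_write_verify,
    };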
629/* 681/*
630 * xfs_readsb 682 * xfs_readsb
631 * 683 *
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
651 703
652reread: 704reread:
653 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 705 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
654 BTOBB(sector_size), 0); 706 BTOBB(sector_size), 0,
707 loud ? &xfs_sb_buf_ops
708 : &xfs_sb_quiet_buf_ops);
655 if (!bp) { 709 if (!bp) {
656 if (loud) 710 if (loud)
657 xfs_warn(mp, "SB buffer read failed"); 711 xfs_warn(mp, "SB buffer read failed");
658 return EIO; 712 return EIO;
659 } 713 }
660 714 if (bp->b_error) {
661 /* 715 error = bp->b_error;
662 * Initialize the mount structure from the superblock.
663 * But first do some basic consistency checking.
664 */
665 xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
666 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
667 if (error) {
668 if (loud) 716 if (loud)
669 xfs_warn(mp, "SB validate failed"); 717 xfs_warn(mp, "SB validate failed");
670 goto release_buf; 718 goto release_buf;
671 } 719 }
672 720
673 /* 721 /*
722 * Initialize the mount structure from the superblock.
723 */
724 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
725
726 /*
674 * We must be able to do sector-sized and sector-aligned IO. 727 * We must be able to do sector-sized and sector-aligned IO.
675 */ 728 */
676 if (sector_size > mp->m_sb.sb_sectsize) { 729 if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1001 } 1054 }
1002 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 1055 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1003 d - XFS_FSS_TO_BB(mp, 1), 1056 d - XFS_FSS_TO_BB(mp, 1),
1004 XFS_FSS_TO_BB(mp, 1), 0); 1057 XFS_FSS_TO_BB(mp, 1), 0, NULL);
1005 if (!bp) { 1058 if (!bp) {
1006 xfs_warn(mp, "last sector read failed"); 1059 xfs_warn(mp, "last sector read failed");
1007 return EIO; 1060 return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1016 } 1069 }
1017 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 1070 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1018 d - XFS_FSB_TO_BB(mp, 1), 1071 d - XFS_FSB_TO_BB(mp, 1),
1019 XFS_FSB_TO_BB(mp, 1), 0); 1072 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1020 if (!bp) { 1073 if (!bp) {
1021 xfs_warn(mp, "log device read failed"); 1074 xfs_warn(mp, "log device read failed");
1022 return EIO; 1075 return EIO;
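Note the new final argument to xfs_buf_read_uncached(): callers pass a verifier ops table, or NULL to opt out and check bp->b_error by hand. The size-check hunks above and the realtime hunks below all follow the same shape (daddr and bblen stand in for the computed disk address and length):

    bp = xfs_buf_read_uncached(mp->m_ddev_targp, daddr, bblen, 0, NULL);
    if (!bp)
    	return EIO;		/* buffer could not be set up at all */
    if (bp->b_error) {		/* I/O or verifier failure */
    	error = bp->b_error;
    	xfs_buf_relse(bp);
    	return error;
    }
    xfs_buf_relse(bp);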
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
1427 __uint64_t resblks; 1480 __uint64_t resblks;
1428 int error; 1481 int error;
1429 1482
1483 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1484
1430 xfs_qm_unmount_quotas(mp); 1485 xfs_qm_unmount_quotas(mp);
1431 xfs_rtunmount_inodes(mp); 1486 xfs_rtunmount_inodes(mp);
1432 IRELE(mp->m_rootip); 1487 IRELE(mp->m_rootip);
@@ -1450,21 +1505,16 @@ xfs_unmountfs(
1450 1505
1451 /* 1506 /*
1452 * And reclaim all inodes. At this point there should be no dirty 1507 * And reclaim all inodes. At this point there should be no dirty
1453 * inode, and none should be pinned or locked, but use synchronous 1508 * inodes and none should be pinned or locked, but use synchronous
1454 * reclaim just to be sure. 1509 * reclaim just to be sure. We can stop background inode reclaim
1510 * here as well if it is still running.
1455 */ 1511 */
1512 cancel_delayed_work_sync(&mp->m_reclaim_work);
1456 xfs_reclaim_inodes(mp, SYNC_WAIT); 1513 xfs_reclaim_inodes(mp, SYNC_WAIT);
1457 1514
1458 xfs_qm_unmount(mp); 1515 xfs_qm_unmount(mp);
1459 1516
1460 /* 1517 /*
1461 * Flush out the log synchronously so that we know for sure
1462 * that nothing is pinned. This is important because bflush()
1463 * will skip pinned buffers.
1464 */
1465 xfs_log_force(mp, XFS_LOG_SYNC);
1466
1467 /*
1468 * Unreserve any blocks we have so that when we unmount we don't account 1518 * Unreserve any blocks we have so that when we unmount we don't account
1469 * the reserved free space as used. This is really only necessary for 1519 * the reserved free space as used. This is really only necessary for
1470 * lazy superblock counting because it trusts the incore superblock 1520 * lazy superblock counting because it trusts the incore superblock
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
1489 xfs_warn(mp, "Unable to update superblock counters. " 1539 xfs_warn(mp, "Unable to update superblock counters. "
1490 "Freespace may not be correct on next mount."); 1540 "Freespace may not be correct on next mount.");
1491 1541
1492 /*
1493 * At this point we might have modified the superblock again and thus
1494 * added an item to the AIL, thus flush it again.
1495 */
1496 xfs_ail_push_all_sync(mp->m_ail);
1497 xfs_wait_buftarg(mp->m_ddev_targp);
1498
1499 /*
1500 * The superblock buffer is uncached and xfsaild_push() will lock and
1501 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
1502 * here but a lock on the superblock buffer will block until iodone()
1503 * has completed.
1504 */
1505 xfs_buf_lock(mp->m_sb_bp);
1506 xfs_buf_unlock(mp->m_sb_bp);
1507
1508 xfs_log_unmount_write(mp);
1509 xfs_log_unmount(mp); 1542 xfs_log_unmount(mp);
1510 xfs_uuid_unmount(mp); 1543 xfs_uuid_unmount(mp);
1511 1544
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
53 53
54#include "xfs_sync.h"
55
56struct xlog; 54struct xlog;
57struct xfs_inode; 55struct xfs_inode;
58struct xfs_mru_cache; 56struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
197 struct mutex m_icsb_mutex; /* balancer sync lock */ 195 struct mutex m_icsb_mutex; /* balancer sync lock */
198#endif 196#endif
199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 197 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 struct delayed_work m_sync_work; /* background sync work */
201 struct delayed_work m_reclaim_work; /* background inode reclaim */ 198 struct delayed_work m_reclaim_work; /* background inode reclaim */
202 struct work_struct m_flush_work; /* background inode flush */ 199 struct delayed_work m_eofblocks_work; /* background eof blocks
200 trimming */
203 __int64_t m_update_flags; /* sb flags we need to update 201 __int64_t m_update_flags; /* sb flags we need to update
204 on the next remount,rw */ 202 on the next remount,rw */
205 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
209 struct workqueue_struct *m_data_workqueue; 207 struct workqueue_struct *m_data_workqueue;
210 struct workqueue_struct *m_unwritten_workqueue; 208 struct workqueue_struct *m_unwritten_workqueue;
211 struct workqueue_struct *m_cil_workqueue; 209 struct workqueue_struct *m_cil_workqueue;
210 struct workqueue_struct *m_reclaim_workqueue;
211 struct workqueue_struct *m_log_workqueue;
212 struct workqueue_struct *m_eofblocks_workqueue;
212} xfs_mount_t; 213} xfs_mount_t;
213 214
214/* 215/*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 388extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
388extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 389extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
389 xfs_agnumber_t *); 390 xfs_agnumber_t *);
390extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); 391extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
391extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 392extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
392 393
394extern const struct xfs_buf_ops xfs_sb_buf_ops;
395
393#endif /* __XFS_MOUNT_H__ */ 396#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44/* 45/*
45 * The global quota manager. There is only one of these for the entire 46 * The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
891 while (blkcnt--) { 892 while (blkcnt--) {
892 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 893 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
893 XFS_FSB_TO_DADDR(mp, bno), 894 XFS_FSB_TO_DADDR(mp, bno),
894 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 895 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
896 &xfs_dquot_buf_ops);
895 if (error) 897 if (error)
896 break; 898 break;
897 899
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
978 while (rablkcnt--) { 980 while (rablkcnt--) {
979 xfs_buf_readahead(mp->m_ddev_targp, 981 xfs_buf_readahead(mp->m_ddev_targp,
980 XFS_FSB_TO_DADDR(mp, rablkno), 982 XFS_FSB_TO_DADDR(mp, rablkno),
981 mp->m_quotainfo->qi_dqchunklen); 983 mp->m_quotainfo->qi_dqchunklen,
984 NULL);
982 rablkno++; 985 rablkno++;
983 } 986 }
984 } 987 }
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
1453 int error; 1456 int error;
1454 1457
1455 if (!xfs_dqlock_nowait(dqp)) 1458 if (!xfs_dqlock_nowait(dqp))
1456 goto out_busy; 1459 goto out_move_tail;
1457 1460
1458 /* 1461 /*
1459 * This dquot has acquired a reference in the meantime; remove it from 1462
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
1476 * getting flushed to disk, we don't want to reclaim it. 1479 * getting flushed to disk, we don't want to reclaim it.
1477 */ 1480 */
1478 if (!xfs_dqflock_nowait(dqp)) 1481 if (!xfs_dqflock_nowait(dqp))
1479 goto out_busy; 1482 goto out_unlock_move_tail;
1480 1483
1481 if (XFS_DQ_IS_DIRTY(dqp)) { 1484 if (XFS_DQ_IS_DIRTY(dqp)) {
1482 struct xfs_buf *bp = NULL; 1485 struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
1487 if (error) { 1490 if (error) {
1488 xfs_warn(mp, "%s: dquot %p flush failed", 1491 xfs_warn(mp, "%s: dquot %p flush failed",
1489 __func__, dqp); 1492 __func__, dqp);
1490 goto out_busy; 1493 goto out_unlock_move_tail;
1491 } 1494 }
1492 1495
1493 xfs_buf_delwri_queue(bp, buffer_list); 1496 xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
1496 * Give the dquot another try on the freelist, as the 1499 * Give the dquot another try on the freelist, as the
1497 * flushing will take some time. 1500 * flushing will take some time.
1498 */ 1501 */
1499 goto out_busy; 1502 goto out_unlock_move_tail;
1500 } 1503 }
1501 xfs_dqfunlock(dqp); 1504 xfs_dqfunlock(dqp);
1502 1505
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
1515 XFS_STATS_INC(xs_qm_dqreclaims); 1518 XFS_STATS_INC(xs_qm_dqreclaims);
1516 return; 1519 return;
1517 1520
1518out_busy:
1519 xfs_dqunlock(dqp);
1520
1521 /* 1521 /*
1522 * Move the dquot to the tail of the list so that we don't spin on it. 1522 * Move the dquot to the tail of the list so that we don't spin on it.
1523 */ 1523 */
1524out_unlock_move_tail:
1525 xfs_dqunlock(dqp);
1526out_move_tail:
1524 list_move_tail(&dqp->q_lru, &qi->qi_lru_list); 1527 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1525
1526 trace_xfs_dqreclaim_busy(dqp); 1528 trace_xfs_dqreclaim_busy(dqp);
1527 XFS_STATS_INC(xs_qm_dqreclaim_misses); 1529 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1528} 1530}
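The relabelled exits make the lock state explicit: out_unlock_move_tail is for failures that still hold the dquot lock, out_move_tail for the one path where the trylock never succeeded. Stripped of the dquot details, the idiom looks like this (generic names, not code from this patch):

    	if (!spin_trylock(&obj->lock))
    		goto out_move_tail;		/* lock was never taken */
    	if (!can_make_progress(obj))
    		goto out_unlock_move_tail;	/* lock held, drop it first */
    	/* ... do the real work, return on success ... */
    	return;

    out_unlock_move_tail:
    	spin_unlock(&obj->lock);
    out_move_tail:
    	list_move_tail(&obj->lru, &lru_list);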
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..5f53e75409b8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 45STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 46STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -845,7 +846,8 @@ STATIC int
845xfs_dqrele_inode( 846xfs_dqrele_inode(
846 struct xfs_inode *ip, 847 struct xfs_inode *ip,
847 struct xfs_perag *pag, 848 struct xfs_perag *pag,
848 int flags) 849 int flags,
850 void *args)
849{ 851{
850 /* skip quota inodes */ 852 /* skip quota inodes */
851 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
881 uint flags) 883 uint flags)
882{ 884{
883 ASSERT(mp->m_quotainfo); 885 ASSERT(mp->m_quotainfo);
884 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); 886 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
885} 887}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
38#include "xfs_utils.h" 38#include "xfs_utils.h"
39#include "xfs_trace.h" 39#include "xfs_trace.h"
40#include "xfs_buf.h" 40#include "xfs_buf.h"
41#include "xfs_icache.h"
41 42
42 43
43/* 44/*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
869 ASSERT(map.br_startblock != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
870 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
871 XFS_FSB_TO_DADDR(mp, map.br_startblock), 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
872 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp, NULL);
873 if (error) 874 if (error)
874 return error; 875 return error;
875 ASSERT(!xfs_buf_geterror(bp)); 876 ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
1872 */ 1873 */
1873 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1874 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1874 XFS_FSB_TO_BB(mp, nrblocks - 1), 1875 XFS_FSB_TO_BB(mp, nrblocks - 1),
1875 XFS_FSB_TO_BB(mp, 1), 0); 1876 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1876 if (!bp) 1877 if (!bp)
1877 return EIO; 1878 return EIO;
1879 if (bp->b_error) {
1880 error = bp->b_error;
1881 xfs_buf_relse(bp);
1882 return error;
1883 }
1878 xfs_buf_relse(bp); 1884 xfs_buf_relse(bp);
1879 1885
1880 /* 1886 /*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
2219 } 2225 }
2220 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 2226 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
2221 d - XFS_FSB_TO_BB(mp, 1), 2227 d - XFS_FSB_TO_BB(mp, 1),
2222 XFS_FSB_TO_BB(mp, 1), 0); 2228 XFS_FSB_TO_BB(mp, 1), 0, NULL);
2223 if (!bp) { 2229 if (!bp || bp->b_error) {
2224 xfs_warn(mp, "realtime device size check failed"); 2230 xfs_warn(mp, "realtime device size check failed");
2231 if (bp)
2232 xfs_buf_relse(bp);
2225 return EIO; 2233 return EIO;
2226 } 2234 }
2227 xfs_buf_relse(bp); 2235 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
84 85
85#define XFS_SB_VERSION2_OKREALFBITS \ 86#define XFS_SB_VERSION2_OKREALFBITS \
86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 87 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); 504 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504} 505}
505 506
507static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
508{
509 return (xfs_sb_version_hasmorebits(sbp) &&
510 (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
511}
512
506/* 513/*
507 * end of superblock version macros 514 * end of superblock version macros
508 */ 515 */
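xfs_sb_version_hascrc() follows the established features2 pattern: the bit only means anything when the MOREBITSBIT escape is set in sb_versionnum, hence the hasmorebits() guard. A caller would gate CRC-specific work on it, roughly like so (verify_metadata_crc is a made-up stand-in):

    if (xfs_sb_version_hascrc(&mp->m_sb))
    	error = verify_metadata_crc(mp, bp);	/* hypothetical helper */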
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_sync.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54 54
55#include <linux/namei.h> 55#include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
863 WQ_MEM_RECLAIM, 0, mp->m_fsname); 863 WQ_MEM_RECLAIM, 0, mp->m_fsname);
864 if (!mp->m_cil_workqueue) 864 if (!mp->m_cil_workqueue)
865 goto out_destroy_unwritten; 865 goto out_destroy_unwritten;
866
867 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
868 WQ_NON_REENTRANT, 0, mp->m_fsname);
869 if (!mp->m_reclaim_workqueue)
870 goto out_destroy_cil;
871
872 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname);
874 if (!mp->m_log_workqueue)
875 goto out_destroy_reclaim;
876
877 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname);
879 if (!mp->m_eofblocks_workqueue)
880 goto out_destroy_log;
881
866 return 0; 882 return 0;
867 883
884out_destroy_log:
885 destroy_workqueue(mp->m_log_workqueue);
886out_destroy_reclaim:
887 destroy_workqueue(mp->m_reclaim_workqueue);
888out_destroy_cil:
889 destroy_workqueue(mp->m_cil_workqueue);
868out_destroy_unwritten: 890out_destroy_unwritten:
869 destroy_workqueue(mp->m_unwritten_workqueue); 891 destroy_workqueue(mp->m_unwritten_workqueue);
870out_destroy_data_iodone_queue: 892out_destroy_data_iodone_queue:
@@ -877,11 +899,32 @@ STATIC void
877xfs_destroy_mount_workqueues( 899xfs_destroy_mount_workqueues(
878 struct xfs_mount *mp) 900 struct xfs_mount *mp)
879{ 901{
902 destroy_workqueue(mp->m_eofblocks_workqueue);
903 destroy_workqueue(mp->m_log_workqueue);
904 destroy_workqueue(mp->m_reclaim_workqueue);
880 destroy_workqueue(mp->m_cil_workqueue); 905 destroy_workqueue(mp->m_cil_workqueue);
881 destroy_workqueue(mp->m_data_workqueue); 906 destroy_workqueue(mp->m_data_workqueue);
882 destroy_workqueue(mp->m_unwritten_workqueue); 907 destroy_workqueue(mp->m_unwritten_workqueue);
883} 908}
884 909
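The three new workqueues extend the existing unwind chain: every allocation failure jumps to a label that destroys everything allocated before it, in reverse order, so there is exactly one destroy call per successful alloc on every path. Reduced to a skeleton (names generic):

    a = alloc_workqueue("a", 0, 0);
    if (!a)
    	return -ENOMEM;
    b = alloc_workqueue("b", 0, 0);
    if (!b)
    	goto out_destroy_a;
    c = alloc_workqueue("c", 0, 0);
    if (!c)
    	goto out_destroy_b;
    return 0;

    out_destroy_b:
    	destroy_workqueue(b);
    out_destroy_a:
    	destroy_workqueue(a);
    	return -ENOMEM;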
910/*
911 * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
912 * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
913 * for IO to complete so that we effectively throttle multiple callers to the
914 * rate at which IO is completing.
915 */
916void
917xfs_flush_inodes(
918 struct xfs_mount *mp)
919{
920 struct super_block *sb = mp->m_super;
921
922 if (down_read_trylock(&sb->s_umount)) {
923 sync_inodes_sb(sb);
924 up_read(&sb->s_umount);
925 }
926}
927
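xfs_flush_inodes() takes the superblock umount lock with down_read_trylock() so that it quietly does nothing if an unmount is already underway. Its caller pattern shows up in the xfs_create() hunk further down: on ENOSPC, flush dirty data so outstanding delalloc reservations get written back, then retry the transaction reservation once:

    error = xfs_trans_reserve(tp, resblks, log_res, 0,
    			  XFS_TRANS_PERM_LOG_RES, log_count);
    if (error == ENOSPC) {
    	/* flush outstanding delalloc blocks and retry */
    	xfs_flush_inodes(mp);
    	error = xfs_trans_reserve(tp, resblks, log_res, 0,
    				  XFS_TRANS_PERM_LOG_RES, log_count);
    }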
885/* Catch misguided souls that try to use this interface on XFS */ 928/* Catch misguided souls that try to use this interface on XFS */
886STATIC struct inode * 929STATIC struct inode *
887xfs_fs_alloc_inode( 930xfs_fs_alloc_inode(
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
1006 struct xfs_mount *mp = XFS_M(sb); 1049 struct xfs_mount *mp = XFS_M(sb);
1007 1050
1008 xfs_filestream_unmount(mp); 1051 xfs_filestream_unmount(mp);
1009 cancel_delayed_work_sync(&mp->m_sync_work);
1010 xfs_unmountfs(mp); 1052 xfs_unmountfs(mp);
1011 xfs_syncd_stop(mp); 1053
1012 xfs_freesb(mp); 1054 xfs_freesb(mp);
1013 xfs_icsb_destroy_counters(mp); 1055 xfs_icsb_destroy_counters(mp);
1014 xfs_destroy_mount_workqueues(mp); 1056 xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
1023 int wait) 1065 int wait)
1024{ 1066{
1025 struct xfs_mount *mp = XFS_M(sb); 1067 struct xfs_mount *mp = XFS_M(sb);
1026 int error;
1027 1068
1028 /* 1069 /*
1029 * Doing anything during the async pass would be counterproductive. 1070 * Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
1031 if (!wait) 1072 if (!wait)
1032 return 0; 1073 return 0;
1033 1074
1034 error = xfs_quiesce_data(mp); 1075 xfs_log_force(mp, XFS_LOG_SYNC);
1035 if (error)
1036 return -error;
1037
1038 if (laptop_mode) { 1076 if (laptop_mode) {
1039 /* 1077 /*
1040 * The disk must be active because we're syncing. 1078 * The disk must be active because we're syncing.
1041 * We schedule xfssyncd now (now that the disk is 1079 * We schedule log work now (now that the disk is
1042 * active) instead of later (when it might not be). 1080 * active) instead of later (when it might not be).
1043 */ 1081 */
1044 flush_delayed_work(&mp->m_sync_work); 1082 flush_delayed_work(&mp->m_log->l_work);
1045 } 1083 }
1046 1084
1047 return 0; 1085 return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
1118 xfs_reserve_blocks(mp, &resblks, NULL); 1156 xfs_reserve_blocks(mp, &resblks, NULL);
1119} 1157}
1120 1158
1159/*
1160 * Trigger writeback of all the dirty metadata in the file system.
1161 *
1162 * This ensures that the metadata is written to its location on disk rather
1163 * than just existing in transactions in the log. This means after a quiesce
1164 * there is no log replay required to write the inodes to disk - this is the
1165 * primary difference between a sync and a quiesce.
1166 *
1167 * Note: xfs_log_quiesce() stops background log work - the callers must ensure
1168 * it is started again when appropriate.
1169 */
1170void
1171xfs_quiesce_attr(
1172 struct xfs_mount *mp)
1173{
1174 int error = 0;
1175
1176 /* wait for all modifications to complete */
1177 while (atomic_read(&mp->m_active_trans) > 0)
1178 delay(100);
1179
1180 /* force the log to unpin objects from the now complete transactions */
1181 xfs_log_force(mp, XFS_LOG_SYNC);
1182
1183 /* reclaim inodes to do any IO before the freeze completes */
1184 xfs_reclaim_inodes(mp, 0);
1185 xfs_reclaim_inodes(mp, SYNC_WAIT);
1186
1187 /* Push the superblock and write an unmount record */
1188 error = xfs_log_sbcount(mp);
1189 if (error)
1190 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
1191 "Frozen image may not be consistent.");
1192 /*
1193 * Just warn here till VFS can correctly support
1194 * read-only remount without racing.
1195 */
1196 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
1197
1198 xfs_log_quiesce(mp);
1199}
1200
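xfs_quiesce_attr() is now the single quiesce entry point for both freeze and the rw->ro remount below, and since xfs_log_quiesce() stops background log work, the unfreeze and ro->rw paths restart it with xfs_log_work_queue(). A sketch of the presumed freeze-side caller (the exact body may differ from this series):

    STATIC int
    xfs_fs_freeze(
    	struct super_block	*sb)
    {
    	struct xfs_mount	*mp = XFS_M(sb);

    	xfs_save_resvblks(mp);		/* as in the rw->ro remount path */
    	xfs_quiesce_attr(mp);
    	return -xfs_fs_log_dummy(mp);
    }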
1121STATIC int 1201STATIC int
1122xfs_fs_remount( 1202xfs_fs_remount(
1123 struct super_block *sb, 1203 struct super_block *sb,
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
1198 * value if it is non-zero, otherwise go with the default. 1278 * value if it is non-zero, otherwise go with the default.
1199 */ 1279 */
1200 xfs_restore_resvblks(mp); 1280 xfs_restore_resvblks(mp);
1281 xfs_log_work_queue(mp);
1201 } 1282 }
1202 1283
1203 /* rw -> ro */ 1284 /* rw -> ro */
1204 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1285 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1205 /* 1286 /*
1206 * After we have synced the data but before we sync the 1287 * Before we sync the metadata, we need to free up the reserve
1207 * metadata, we need to free up the reserve block pool so that 1288 * block pool so that the used block count in the superblock on
1208 * the used block count in the superblock on disk is correct at 1289 * disk is correct at the end of the remount. Stash the current
1209 * the end of the remount. Stash the current reserve pool size 1290 * reserve pool size so that if we get remounted rw, we can
1210 * so that if we get remounted rw, we can return it to the same 1291 * return it to the same size.
1211 * size.
1212 */ 1292 */
1213
1214 xfs_quiesce_data(mp);
1215 xfs_save_resvblks(mp); 1293 xfs_save_resvblks(mp);
1216 xfs_quiesce_attr(mp); 1294 xfs_quiesce_attr(mp);
1217 mp->m_flags |= XFS_MOUNT_RDONLY; 1295 mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
1243 struct xfs_mount *mp = XFS_M(sb); 1321 struct xfs_mount *mp = XFS_M(sb);
1244 1322
1245 xfs_restore_resvblks(mp); 1323 xfs_restore_resvblks(mp);
1324 xfs_log_work_queue(mp);
1246 return 0; 1325 return 0;
1247} 1326}
1248 1327
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
1321 spin_lock_init(&mp->m_sb_lock); 1400 spin_lock_init(&mp->m_sb_lock);
1322 mutex_init(&mp->m_growlock); 1401 mutex_init(&mp->m_growlock);
1323 atomic_set(&mp->m_active_trans, 0); 1402 atomic_set(&mp->m_active_trans, 0);
1403 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1404 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1324 1405
1325 mp->m_super = sb; 1406 mp->m_super = sb;
1326 sb->s_fs_info = mp; 1407 sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
1371 /* 1452 /*
1372 * we must configure the block size in the superblock before we run the 1453 * we must configure the block size in the superblock before we run the
1373 * full mount process as the mount process can lookup and cache inodes. 1454 * full mount process as the mount process can lookup and cache inodes.
1374 * For the same reason we must also initialise the syncd and register
1375 * the inode cache shrinker so that inodes can be reclaimed during
1376 * operations like a quotacheck that iterate all inodes in the
1377 * filesystem.
1378 */ 1455 */
1379 sb->s_magic = XFS_SB_MAGIC; 1456 sb->s_magic = XFS_SB_MAGIC;
1380 sb->s_blocksize = mp->m_sb.sb_blocksize; 1457 sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
1384 sb->s_time_gran = 1; 1461 sb->s_time_gran = 1;
1385 set_posix_acl_flag(sb); 1462 set_posix_acl_flag(sb);
1386 1463
1387 error = xfs_syncd_init(mp);
1388 if (error)
1389 goto out_filestream_unmount;
1390
1391 error = xfs_mountfs(mp); 1464 error = xfs_mountfs(mp);
1392 if (error) 1465 if (error)
1393 goto out_syncd_stop; 1466 goto out_filestream_unmount;
1394 1467
1395 root = igrab(VFS_I(mp->m_rootip)); 1468 root = igrab(VFS_I(mp->m_rootip));
1396 if (!root) { 1469 if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
1408 } 1481 }
1409 1482
1410 return 0; 1483 return 0;
1411 out_syncd_stop: 1484
1412 xfs_syncd_stop(mp);
1413 out_filestream_unmount: 1485 out_filestream_unmount:
1414 xfs_filestream_unmount(mp); 1486 xfs_filestream_unmount(mp);
1415 out_free_sb: 1487 out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
1429 out_unmount: 1501 out_unmount:
1430 xfs_filestream_unmount(mp); 1502 xfs_filestream_unmount(mp);
1431 xfs_unmountfs(mp); 1503 xfs_unmountfs(mp);
1432 xfs_syncd_stop(mp);
1433 goto out_free_sb; 1504 goto out_free_sb;
1434} 1505}
1435 1506
@@ -1625,16 +1696,6 @@ STATIC int __init
1625xfs_init_workqueues(void) 1696xfs_init_workqueues(void)
1626{ 1697{
1627 /* 1698 /*
1628 * We never want to the same work item to run twice, reclaiming inodes
1629 * or idling the log is not going to get any faster by multiple CPUs
1630 * competing for ressources. Use the default large max_active value
1631 * so that even lots of filesystems can perform these task in parallel.
1632 */
1633 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1634 if (!xfs_syncd_wq)
1635 return -ENOMEM;
1636
1637 /*
1638 * The allocation workqueue can be used in memory reclaim situations 1699 * The allocation workqueue can be used in memory reclaim situations
1639 * (writepage path), and parallelism is only limited by the number of 1700 * (writepage path), and parallelism is only limited by the number of
1640 * AGs in all the filesystems mounted. Hence use the default large 1701 * AGs in all the filesystems mounted. Hence use the default large
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
1642 */ 1703 */
1643 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1704 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1644 if (!xfs_alloc_wq) 1705 if (!xfs_alloc_wq)
1645 goto out_destroy_syncd; 1706 return -ENOMEM;
1646 1707
1647 return 0; 1708 return 0;
1648
1649out_destroy_syncd:
1650 destroy_workqueue(xfs_syncd_wq);
1651 return -ENOMEM;
1652} 1709}
1653 1710
1654STATIC void 1711STATIC void
1655xfs_destroy_workqueues(void) 1712xfs_destroy_workqueues(void)
1656{ 1713{
1657 destroy_workqueue(xfs_alloc_wq); 1714 destroy_workqueue(xfs_alloc_wq);
1658 destroy_workqueue(xfs_syncd_wq);
1659} 1715}
1660 1716
1661STATIC int __init 1717STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
74 74
75extern __uint64_t xfs_max_file_offset(unsigned int); 75extern __uint64_t xfs_max_file_offset(unsigned int);
76 76
77extern void xfs_flush_inodes(struct xfs_mount *mp);
77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
202 .extra1 = &xfs_params.fstrm_timer.min, 202 .extra1 = &xfs_params.fstrm_timer.min,
203 .extra2 = &xfs_params.fstrm_timer.max, 203 .extra2 = &xfs_params.fstrm_timer.max,
204 }, 204 },
205 {
206 .procname = "speculative_prealloc_lifetime",
207 .data = &xfs_params.eofb_timer.val,
208 .maxlen = sizeof(int),
209 .mode = 0644,
210 .proc_handler = proc_dointvec_minmax,
211 .extra1 = &xfs_params.eofb_timer.min,
212 .extra2 = &xfs_params.eofb_timer.max,
213 },
205 /* please keep this the last entry */ 214 /* please keep this the last entry */
206#ifdef CONFIG_PROC_FS 215#ifdef CONFIG_PROC_FS
207 { 216 {
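Entries in this table surface under /proc/sys/fs/xfs/, so the new knob should appear as /proc/sys/fs/xfs/speculative_prealloc_lifetime. A userspace check might look like this (path and units inferred from the table above; the value is believed to be in seconds):

    #include <stdio.h>

    int main(void)
    {
    	FILE *f = fopen("/proc/sys/fs/xfs/speculative_prealloc_lifetime", "r");
    	int secs;

    	if (f && fscanf(f, "%d", &secs) == 1)
    		printf("eofblocks scan interval: %d\n", secs);
    	if (f)
    		fclose(f);
    	return 0;
    }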
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ 49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50 xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..2e137d4a85ae 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
99DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
100DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
99 101
100DECLARE_EVENT_CLASS(xfs_perag_class, 102DECLARE_EVENT_CLASS(xfs_perag_class,
101 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, 103 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
130DEFINE_PERAG_REF_EVENT(xfs_perag_put); 132DEFINE_PERAG_REF_EVENT(xfs_perag_put);
131DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 133DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
132DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 134DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
135DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
136DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
133 137
134TRACE_EVENT(xfs_attr_list_node_descend, 138TRACE_EVENT(xfs_attr_list_node_descend,
135 TP_PROTO(struct xfs_attr_list_context *ctx, 139 TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
585DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 589DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
586DEFINE_INODE_EVENT(xfs_dquot_dqdetach); 590DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
587 591
592DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
593DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
594DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
595
588DECLARE_EVENT_CLASS(xfs_iref_class, 596DECLARE_EVENT_CLASS(xfs_iref_class,
589 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 597 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
590 TP_ARGS(ip, caller_ip), 598 TP_ARGS(ip, caller_ip),
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1496DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1504DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1497DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1505DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1498 1506
1507DECLARE_EVENT_CLASS(xfs_attr_class,
1508 TP_PROTO(struct xfs_da_args *args),
1509 TP_ARGS(args),
1510 TP_STRUCT__entry(
1511 __field(dev_t, dev)
1512 __field(xfs_ino_t, ino)
1513 __dynamic_array(char, name, args->namelen)
1514 __field(int, namelen)
1515 __field(int, valuelen)
1516 __field(xfs_dahash_t, hashval)
1517 __field(int, op_flags)
1518 ),
1519 TP_fast_assign(
1520 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1521 __entry->ino = args->dp->i_ino;
1522 if (args->namelen)
1523 memcpy(__get_str(name), args->name, args->namelen);
1524 __entry->namelen = args->namelen;
1525 __entry->valuelen = args->valuelen;
1526 __entry->hashval = args->hashval;
1527 __entry->op_flags = args->op_flags;
1528 ),
1529 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
1530 "hashval 0x%x op_flags %s",
1531 MAJOR(__entry->dev), MINOR(__entry->dev),
1532 __entry->ino,
1533 __entry->namelen,
1534 __entry->namelen ? __get_str(name) : NULL,
1535 __entry->namelen,
1536 __entry->valuelen,
1537 __entry->hashval,
1538 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1539)
1540
1499#define DEFINE_ATTR_EVENT(name) \ 1541#define DEFINE_ATTR_EVENT(name) \
1500DEFINE_EVENT(xfs_da_class, name, \ 1542DEFINE_EVENT(xfs_attr_class, name, \
1501 TP_PROTO(struct xfs_da_args *args), \ 1543 TP_PROTO(struct xfs_da_args *args), \
1502 TP_ARGS(args)) 1544 TP_ARGS(args))
1503DEFINE_ATTR_EVENT(xfs_attr_sf_add); 1545DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_add); 1553DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); 1554DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); 1555DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1556DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); 1557DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1515DEFINE_ATTR_EVENT(xfs_attr_leaf_create); 1558DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1559DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
1560DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
1516DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); 1561DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1517DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); 1562DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1563DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
1518DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); 1564DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1519DEFINE_ATTR_EVENT(xfs_attr_leaf_split); 1565DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1520DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); 1566DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1526DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); 1572DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1527DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); 1573DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1528DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); 1574DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1575DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
1529 1576
1530DEFINE_ATTR_EVENT(xfs_attr_node_addname); 1577DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1578DEFINE_ATTR_EVENT(xfs_attr_node_get);
1531DEFINE_ATTR_EVENT(xfs_attr_node_lookup); 1579DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1532DEFINE_ATTR_EVENT(xfs_attr_node_replace); 1580DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1533DEFINE_ATTR_EVENT(xfs_attr_node_removename); 1581DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1534 1582
1583DEFINE_ATTR_EVENT(xfs_attr_fillstate);
1584DEFINE_ATTR_EVENT(xfs_attr_refillstate);
1585
1586DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
1587DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
1588DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
1589
1535#define DEFINE_DA_EVENT(name) \ 1590#define DEFINE_DA_EVENT(name) \
1536DEFINE_EVENT(xfs_da_class, name, \ 1591DEFINE_EVENT(xfs_da_class, name, \
1537 TP_PROTO(struct xfs_da_args *args), \ 1592 TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
1550DEFINE_DA_EVENT(xfs_da_node_remove); 1605DEFINE_DA_EVENT(xfs_da_node_remove);
1551DEFINE_DA_EVENT(xfs_da_node_rebalance); 1606DEFINE_DA_EVENT(xfs_da_node_rebalance);
1552DEFINE_DA_EVENT(xfs_da_node_unbalance); 1607DEFINE_DA_EVENT(xfs_da_node_unbalance);
1608DEFINE_DA_EVENT(xfs_da_node_toosmall);
1553DEFINE_DA_EVENT(xfs_da_swap_lastblock); 1609DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1554DEFINE_DA_EVENT(xfs_da_grow_inode); 1610DEFINE_DA_EVENT(xfs_da_grow_inode);
1555DEFINE_DA_EVENT(xfs_da_shrink_inode); 1611DEFINE_DA_EVENT(xfs_da_shrink_inode);
1612DEFINE_DA_EVENT(xfs_da_fixhashpath);
1613DEFINE_DA_EVENT(xfs_da_path_shift);
1556 1614
1557DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1615DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1558 TP_PROTO(struct xfs_da_args *args, int idx), 1616 TP_PROTO(struct xfs_da_args *args, int idx),
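Each DEFINE_ATTR_EVENT(name) above generates a trace_<name>() call site bound to the new xfs_attr_class, so instrumenting a function is a one-liner at its entry point. The shape, illustratively (not a hunk from this patch):

    int
    xfs_attr_leaf_get(
    	xfs_da_args_t	*args)
    {
    	trace_xfs_attr_leaf_get(args);	/* fires the xfs_attr_class event */
    	/* ... the actual lookup follows ... */
    }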
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
464 int numblks, 464 int numblks,
465 uint flags) 465 uint flags)
466{ 466{
467 struct xfs_buf_map map = { 467 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
468 .bm_bn = blkno,
469 .bm_len = numblks,
470 };
471 return xfs_trans_get_buf_map(tp, target, &map, 1, flags); 468 return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
472} 469}
473 470
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp,
476 struct xfs_buftarg *target, 473 struct xfs_buftarg *target,
477 struct xfs_buf_map *map, int nmaps, 474 struct xfs_buf_map *map, int nmaps,
478 xfs_buf_flags_t flags, 475 xfs_buf_flags_t flags,
479 struct xfs_buf **bpp); 476 struct xfs_buf **bpp,
477 const struct xfs_buf_ops *ops);
480 478
481static inline int 479static inline int
482xfs_trans_read_buf( 480xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
486 xfs_daddr_t blkno, 484 xfs_daddr_t blkno,
487 int numblks, 485 int numblks,
488 xfs_buf_flags_t flags, 486 xfs_buf_flags_t flags,
489 struct xfs_buf **bpp) 487 struct xfs_buf **bpp,
488 const struct xfs_buf_ops *ops)
490{ 489{
491 struct xfs_buf_map map = { 490 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
492 .bm_bn = blkno, 491 return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
493 .bm_len = numblks, 492 flags, bpp, ops);
494 };
495 return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
496} 493}
497 494
498struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); 495struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
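DEFINE_SINGLE_BUF_MAP replaces the two open-coded struct initializers; judging from how it is used here, it presumably expands to a single-map declaration along these lines (definition assumed, likely in xfs_buf.h):

    #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblks) \
    	struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblks) }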
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..4fc17d479d42 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
257 struct xfs_buf_map *map, 257 struct xfs_buf_map *map,
258 int nmaps, 258 int nmaps,
259 xfs_buf_flags_t flags, 259 xfs_buf_flags_t flags,
260 struct xfs_buf **bpp) 260 struct xfs_buf **bpp,
261 const struct xfs_buf_ops *ops)
261{ 262{
262 xfs_buf_t *bp; 263 xfs_buf_t *bp;
263 xfs_buf_log_item_t *bip; 264 xfs_buf_log_item_t *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
265 266
266 *bpp = NULL; 267 *bpp = NULL;
267 if (!tp) { 268 if (!tp) {
268 bp = xfs_buf_read_map(target, map, nmaps, flags); 269 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
269 if (!bp) 270 if (!bp)
270 return (flags & XBF_TRYLOCK) ? 271 return (flags & XBF_TRYLOCK) ?
271 EAGAIN : XFS_ERROR(ENOMEM); 272 EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
312 if (!(XFS_BUF_ISDONE(bp))) { 313 if (!(XFS_BUF_ISDONE(bp))) {
313 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 314 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
314 ASSERT(!XFS_BUF_ISASYNC(bp)); 315 ASSERT(!XFS_BUF_ISASYNC(bp));
316 ASSERT(bp->b_iodone == NULL);
315 XFS_BUF_READ(bp); 317 XFS_BUF_READ(bp);
318 bp->b_ops = ops;
316 xfsbdstrat(tp->t_mountp, bp); 319 xfsbdstrat(tp->t_mountp, bp);
317 error = xfs_buf_iowait(bp); 320 error = xfs_buf_iowait(bp);
318 if (error) { 321 if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
349 return 0; 352 return 0;
350 } 353 }
351 354
352 bp = xfs_buf_read_map(target, map, nmaps, flags); 355 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
353 if (bp == NULL) { 356 if (bp == NULL) {
354 *bpp = NULL; 357 *bpp = NULL;
355 return (flags & XBF_TRYLOCK) ? 358 return (flags & XBF_TRYLOCK) ?
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_vnodeops.h" 48#include "xfs_vnodeops.h"
49#include "xfs_trace.h" 49#include "xfs_trace.h"
50#include "xfs_icache.h"
50 51
51/* 52/*
52 * The maximum pathlen is 1024 bytes. Since the minimum file system 53 * The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
79 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 80 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
80 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 81 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
81 82
82 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); 83 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
83 if (!bp) 84 if (!bp)
84 return XFS_ERROR(ENOMEM); 85 return XFS_ERROR(ENOMEM);
85 error = bp->b_error; 86 error = bp->b_error;
@@ -150,7 +151,7 @@ xfs_readlink(
150 * when the link count isn't zero and by xfs_dm_punch_hole() when 151 * when the link count isn't zero and by xfs_dm_punch_hole() when
151 * punching a hole to EOF. 152 * punching a hole to EOF.
152 */ 153 */
153STATIC int 154int
154xfs_free_eofblocks( 155xfs_free_eofblocks(
155 xfs_mount_t *mp, 156 xfs_mount_t *mp,
156 xfs_inode_t *ip, 157 xfs_inode_t *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
199 if (need_iolock) { 200 if (need_iolock) {
200 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 201 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
201 xfs_trans_cancel(tp, 0); 202 xfs_trans_cancel(tp, 0);
202 return 0; 203 return EAGAIN;
203 } 204 }
204 } 205 }
205 206
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
237 } else { 238 } else {
238 error = xfs_trans_commit(tp, 239 error = xfs_trans_commit(tp,
239 XFS_TRANS_RELEASE_LOG_RES); 240 XFS_TRANS_RELEASE_LOG_RES);
241 if (!error)
242 xfs_inode_clear_eofblocks_tag(ip);
240 } 243 }
241 244
242 xfs_iunlock(ip, XFS_ILOCK_EXCL); 245 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -425,19 +428,18 @@ xfs_release(
425 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 428 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
426 if (truncated) { 429 if (truncated) {
427 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 430 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
428 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 431 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
429 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 432 error = -filemap_flush(VFS_I(ip)->i_mapping);
433 if (error)
434 return error;
435 }
430 } 436 }
431 } 437 }
432 438
433 if (ip->i_d.di_nlink == 0) 439 if (ip->i_d.di_nlink == 0)
434 return 0; 440 return 0;
435 441
436 if ((S_ISREG(ip->i_d.di_mode) && 442 if (xfs_can_free_eofblocks(ip, false)) {
437 (VFS_I(ip)->i_size > 0 ||
438 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
439 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
440 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
441 443
442 /* 444 /*
443 * If we can't get the iolock just skip truncating the blocks 445 * If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
464 return 0; 466 return 0;
465 467
466 error = xfs_free_eofblocks(mp, ip, true); 468 error = xfs_free_eofblocks(mp, ip, true);
467 if (error) 469 if (error && error != EAGAIN)
468 return error; 470 return error;
469 471
470 /* delalloc blocks after truncation means it really is dirty */ 472 /* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
513 goto out; 515 goto out;
514 516
515 if (ip->i_d.di_nlink != 0) { 517 if (ip->i_d.di_nlink != 0) {
516 if ((S_ISREG(ip->i_d.di_mode) && 518 /*
517 (VFS_I(ip)->i_size > 0 || 519 * force is true because we are evicting an inode from the
518 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && 520 * cache. Post-eof blocks must be freed, lest we end up with
519 (ip->i_df.if_flags & XFS_IFEXTENTS) && 521 * broken free space accounting.
520 (!(ip->i_d.di_flags & 522 */
521 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 523 if (xfs_can_free_eofblocks(ip, true)) {
522 ip->i_delayed_blks != 0))) {
523 error = xfs_free_eofblocks(mp, ip, false); 524 error = xfs_free_eofblocks(mp, ip, false);
524 if (error) 525 if (error)
525 return VN_INACTIVE_CACHE; 526 return VN_INACTIVE_CACHE;
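The deleted open-coded conditions show what xfs_can_free_eofblocks() must consolidate: only regular files, only when there is something to trim (size, cached pages, or delalloc blocks), only with the extent list read in, and prealloc/append files only when forced and carrying delalloc blocks. A reconstruction from those deleted conditions alone (the real helper is added elsewhere in this series and may differ):

    bool
    xfs_can_free_eofblocks(
    	struct xfs_inode	*ip,
    	bool			force)
    {
    	if (!S_ISREG(ip->i_d.di_mode))
    		return false;
    	if (VFS_I(ip)->i_size == 0 &&
    	    VN_CACHED(VFS_I(ip)) == 0 &&
    	    ip->i_delayed_blks == 0)
    		return false;
    	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
    		return false;
    	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
    		return force && ip->i_delayed_blks != 0;
    	return true;
    }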
@@ -777,7 +778,7 @@ xfs_create(
777 XFS_TRANS_PERM_LOG_RES, log_count); 778 XFS_TRANS_PERM_LOG_RES, log_count);
778 if (error == ENOSPC) { 779 if (error == ENOSPC) {
779 /* flush outstanding delalloc blocks and retry */ 780 /* flush outstanding delalloc blocks and retry */
780 xfs_flush_inodes(dp); 781 xfs_flush_inodes(mp);
781 error = xfs_trans_reserve(tp, resblks, log_res, 0, 782 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 XFS_TRANS_PERM_LOG_RES, log_count); 783 XFS_TRANS_PERM_LOG_RES, log_count);
783 } 784 }
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
1957 1958
1958 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1959 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1959 ioffset = offset & ~(rounding - 1); 1960 ioffset = offset & ~(rounding - 1);
1960 1961 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1961 if (VN_CACHED(VFS_I(ip)) != 0) { 1962 ioffset, -1);
1962 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 1963 if (error)
1963 if (error) 1964 goto out_unlock_iolock;
1964 goto out_unlock_iolock; 1965 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1965 }
1966 1966
1967 /* 1967 /*
1968 * Need to zero the stuff we're not freeing, on disk. 1968 * Need to zero the stuff we're not freeing, on disk.
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
2095 return error; 2095 return error;
2096} 2096}
2097 2097
2098
2099STATIC int
2100xfs_zero_file_space(
2101 struct xfs_inode *ip,
2102 xfs_off_t offset,
2103 xfs_off_t len,
2104 int attr_flags)
2105{
2106 struct xfs_mount *mp = ip->i_mount;
2107 uint granularity;
2108 xfs_off_t start_boundary;
2109 xfs_off_t end_boundary;
2110 int error;
2111
2112 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2113
2114 /*
2115 * Round the range of extents we are going to convert inwards. If the
2116 * offset is aligned, then it doesn't get changed so we zero from the
2117 * start of the block offset points to.
2118 */
2119 start_boundary = round_up(offset, granularity);
2120 end_boundary = round_down(offset + len, granularity);
2121
2122 ASSERT(start_boundary >= offset);
2123 ASSERT(end_boundary <= offset + len);
2124
2125 if (!(attr_flags & XFS_ATTR_NOLOCK))
2126 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2127
2128 if (start_boundary < end_boundary - 1) {
2129 /* punch out the page cache over the conversion range */
2130 truncate_pagecache_range(VFS_I(ip), start_boundary,
2131 end_boundary - 1);
2132 /* convert the blocks */
2133 error = xfs_alloc_file_space(ip, start_boundary,
2134 end_boundary - start_boundary - 1,
2135 XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
2136 attr_flags);
2137 if (error)
2138 goto out_unlock;
2139
2140 /* We've handled the interior of the range, now for the edges */
2141 if (start_boundary != offset)
2142 error = xfs_iozero(ip, offset, start_boundary - offset);
2143 if (error)
2144 goto out_unlock;
2145
2146 if (end_boundary != offset + len)
2147 error = xfs_iozero(ip, end_boundary,
2148 offset + len - end_boundary);
2149
2150 } else {
2151 /*
2152 * It's either a sub-granularity range or the range spanned lies
2153 * partially across two adjacent blocks.
2154 */
2155 error = xfs_iozero(ip, offset, len);
2156 }
2157
2158out_unlock:
2159 if (!(attr_flags & XFS_ATTR_NOLOCK))
2160 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2161 return error;
2162
2163}
2164
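A worked example of the rounding above: with granularity 4096, offset 3000 and len 10000, start_boundary = round_up(3000, 4096) = 4096 and end_boundary = round_down(13000, 4096) = 12288, so blocks are converted over [4096, 12288) and xfs_iozero() handles the [3000, 4096) and [12288, 13000) edges. A quick userspace check of that arithmetic (the macros are generic re-implementations, not the kernel's):

    #include <assert.h>

    #define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))
    #define round_down(x, y)	(((x) / (y)) * (y))

    int main(void)
    {
    	long offset = 3000, len = 10000, gran = 4096;

    	assert(round_up(offset, gran) == 4096);
    	assert(round_down(offset + len, gran) == 12288);
    	return 0;
    }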
2098/* 2165/*
2099 * xfs_change_file_space() 2166 * xfs_change_file_space()
2100 * This routine allocates or frees disk space for the given file. 2167 * This routine allocates or frees disk space for the given file.
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
2120 xfs_fsize_t fsize; 2187 xfs_fsize_t fsize;
2121 int setprealloc; 2188 int setprealloc;
2122 xfs_off_t startoffset; 2189 xfs_off_t startoffset;
2123 xfs_off_t llen;
2124 xfs_trans_t *tp; 2190 xfs_trans_t *tp;
2125 struct iattr iattr; 2191 struct iattr iattr;
2126 int prealloc_type;
2127 2192
2128 if (!S_ISREG(ip->i_d.di_mode)) 2193 if (!S_ISREG(ip->i_d.di_mode))
2129 return XFS_ERROR(EINVAL); 2194 return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
2141 return XFS_ERROR(EINVAL); 2206 return XFS_ERROR(EINVAL);
2142 } 2207 }
2143 2208
2144 llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; 2209 /*
2210 * length of <= 0 for resv/unresv/zero is invalid. length for
2211 * alloc/free is ignored completely and we have no idea what userspace
2212 * might have set it to, so set it to zero to allow range
2213 * checks to pass.
2214 */
2215 switch (cmd) {
2216 case XFS_IOC_ZERO_RANGE:
2217 case XFS_IOC_RESVSP:
2218 case XFS_IOC_RESVSP64:
2219 case XFS_IOC_UNRESVSP:
2220 case XFS_IOC_UNRESVSP64:
2221 if (bf->l_len <= 0)
2222 return XFS_ERROR(EINVAL);
2223 break;
2224 default:
2225 bf->l_len = 0;
2226 break;
2227 }
2145 2228
2146 if (bf->l_start < 0 || 2229 if (bf->l_start < 0 ||
2147 bf->l_start > mp->m_super->s_maxbytes || 2230 bf->l_start > mp->m_super->s_maxbytes ||
2148 bf->l_start + llen < 0 || 2231 bf->l_start + bf->l_len < 0 ||
2149 bf->l_start + llen > mp->m_super->s_maxbytes) 2232 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
2150 return XFS_ERROR(EINVAL); 2233 return XFS_ERROR(EINVAL);
2151 2234
2152 bf->l_whence = 0; 2235 bf->l_whence = 0;
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
2154 startoffset = bf->l_start; 2237 startoffset = bf->l_start;
2155 fsize = XFS_ISIZE(ip); 2238 fsize = XFS_ISIZE(ip);
2156 2239
2157 /*
2158 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2159 * file space.
2160 * These calls do NOT zero the data space allocated to the file,
2161 * nor do they change the file size.
2162 *
2163 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2164 * space.
2165 * These calls cause the new file data to be zeroed and the file
2166 * size to be changed.
2167 */
2168 setprealloc = clrprealloc = 0; 2240 setprealloc = clrprealloc = 0;
2169 prealloc_type = XFS_BMAPI_PREALLOC;
2170
2171 switch (cmd) { 2241 switch (cmd) {
2172 case XFS_IOC_ZERO_RANGE: 2242 case XFS_IOC_ZERO_RANGE:
2173 prealloc_type |= XFS_BMAPI_CONVERT; 2243 error = xfs_zero_file_space(ip, startoffset, bf->l_len,
2174 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); 2244 attr_flags);
2175 /* FALLTHRU */ 2245 if (error)
2246 return error;
2247 setprealloc = 1;
2248 break;
2249
2176 case XFS_IOC_RESVSP: 2250 case XFS_IOC_RESVSP:
2177 case XFS_IOC_RESVSP64: 2251 case XFS_IOC_RESVSP64:
2178 error = xfs_alloc_file_space(ip, startoffset, bf->l_len, 2252 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2179 prealloc_type, attr_flags); 2253 XFS_BMAPI_PREALLOC, attr_flags);
2180 if (error) 2254 if (error)
2181 return error; 2255 return error;
2182 setprealloc = 1; 2256 setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); 48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
50 int flags, struct attrlist_cursor_kern *cursor); 50 int flags, struct attrlist_cursor_kern *cursor);
51void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
52 xfs_off_t last, int fiopt);
53int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
54 xfs_off_t last, int fiopt);
55int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
56 xfs_off_t last, uint64_t flags, int fiopt);
57int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
58 51
52int xfs_iozero(struct xfs_inode *, loff_t, size_t);
59int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 53int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
54int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
60 55
61#endif /* _XFS_VNODEOPS_H */ 56#endif /* _XFS_VNODEOPS_H */