aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/binfmt_elf.c238
-rw-r--r--fs/btrfs/check-integrity.c163
-rw-r--r--fs/btrfs/compression.c18
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/ctree.h85
-rw-r--r--fs/btrfs/dev-replace.c32
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c49
-rw-r--r--fs/btrfs/extent-tree.c211
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file.c51
-rw-r--r--fs/btrfs/free-space-cache.c117
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c152
-rw-r--r--fs/btrfs/ioctl.c36
-rw-r--r--fs/btrfs/ordered-data.c49
-rw-r--r--fs/btrfs/ordered-data.h12
-rw-r--r--fs/btrfs/raid56.c763
-rw-r--r--fs/btrfs/raid56.h16
-rw-r--r--fs/btrfs/scrub.c893
-rw-r--r--fs/btrfs/send.c49
-rw-r--r--fs/btrfs/super.c94
-rw-r--r--fs/btrfs/sysfs.c34
-rw-r--r--fs/btrfs/transaction.c166
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c50
-rw-r--r--fs/btrfs/volumes.c90
-rw-r--r--fs/btrfs/volumes.h32
-rw-r--r--fs/btrfs/xattr.c150
-rw-r--r--fs/ext4/ext4.h41
-rw-r--r--fs/ext4/extents.c223
-rw-r--r--fs/ext4/extents_status.c321
-rw-r--r--fs/ext4/extents_status.h82
-rw-r--r--fs/ext4/file.c220
-rw-r--r--fs/ext4/inline.c35
-rw-r--r--fs/ext4/inode.c37
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c15
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/resize.c6
-rw-r--r--fs/ext4/super.c51
-rw-r--r--fs/jbd2/journal.c3
-rw-r--r--fs/pstore/ram.c13
-rw-r--r--fs/pstore/ram_core.c31
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/xfs/libxfs/xfs_ag.h281
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_attr.c3
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c77
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c2
-rw-r--r--fs/xfs/libxfs/xfs_dinode.h243
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c20
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h140
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c11
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h140
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c13
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c2
-rw-r--r--fs/xfs/libxfs/xfs_format.h1107
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c43
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c4
-rw-r--r--fs/xfs/libxfs/xfs_inum.h60
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.h584
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_acl.h36
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_attr_inactive.c3
-rw-r--r--fs/xfs/xfs_attr_list.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c3
-rw-r--r--fs/xfs/xfs_buf.c27
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c21
-rw-r--r--fs/xfs/xfs_discard.c1
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_export.c3
-rw-r--r--fs/xfs/xfs_extent_busy.c1
-rw-r--r--fs/xfs/xfs_extfree_item.c3
-rw-r--r--fs/xfs/xfs_file.c9
-rw-r--r--fs/xfs/xfs_filestream.c3
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h8
-rw-r--r--fs/xfs/xfs_icreate_item.c3
-rw-r--r--fs/xfs/xfs_inode.c29
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c3
-rw-r--r--fs/xfs/xfs_ioctl.c3
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_iops.c5
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_linux.h6
-rw-r--r--fs/xfs/xfs_log.c8
-rw-r--r--fs/xfs/xfs_log_cil.c3
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_message.c3
-rw-r--r--fs/xfs/xfs_mount.c33
-rw-r--r--fs/xfs/xfs_mount.h8
-rw-r--r--fs/xfs/xfs_qm.c14
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c27
-rw-r--r--fs/xfs/xfs_quotaops.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c3
-rw-r--r--fs/xfs/xfs_super.c19
-rw-r--r--fs/xfs/xfs_symlink.c3
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans_ail.c3
-rw-r--r--fs/xfs/xfs_trans_buf.c137
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c3
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_xattr.c2
142 files changed, 4953 insertions, 3071 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 370b24cee4d8..c055d56ec63d 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -30,6 +30,9 @@ config COMPAT_BINFMT_ELF
30config ARCH_BINFMT_ELF_RANDOMIZE_PIE 30config ARCH_BINFMT_ELF_RANDOMIZE_PIE
31 bool 31 bool
32 32
33config ARCH_BINFMT_ELF_STATE
34 bool
35
33config BINFMT_ELF_FDPIC 36config BINFMT_ELF_FDPIC
34 bool "Kernel support for FDPIC ELF binaries" 37 bool "Kernel support for FDPIC ELF binaries"
35 default y 38 default y
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3a6175fe10c0..02b16910f4c9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -386,6 +386,127 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
386 ELF_PAGESTART(cmds[first_idx].p_vaddr); 386 ELF_PAGESTART(cmds[first_idx].p_vaddr);
387} 387}
388 388
389/**
390 * load_elf_phdrs() - load ELF program headers
391 * @elf_ex: ELF header of the binary whose program headers should be loaded
392 * @elf_file: the opened ELF binary file
393 *
394 * Loads ELF program headers from the binary file elf_file, which has the ELF
395 * header pointed to by elf_ex, into a newly allocated array. The caller is
396 * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
397 */
398static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
399 struct file *elf_file)
400{
401 struct elf_phdr *elf_phdata = NULL;
402 int retval, size, err = -1;
403
404 /*
405 * If the size of this structure has changed, then punt, since
406 * we will be doing the wrong thing.
407 */
408 if (elf_ex->e_phentsize != sizeof(struct elf_phdr))
409 goto out;
410
411 /* Sanity check the number of program headers... */
412 if (elf_ex->e_phnum < 1 ||
413 elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
414 goto out;
415
416 /* ...and their total size. */
417 size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
418 if (size > ELF_MIN_ALIGN)
419 goto out;
420
421 elf_phdata = kmalloc(size, GFP_KERNEL);
422 if (!elf_phdata)
423 goto out;
424
425 /* Read in the program headers */
426 retval = kernel_read(elf_file, elf_ex->e_phoff,
427 (char *)elf_phdata, size);
428 if (retval != size) {
429 err = (retval < 0) ? retval : -EIO;
430 goto out;
431 }
432
433 /* Success! */
434 err = 0;
435out:
436 if (err) {
437 kfree(elf_phdata);
438 elf_phdata = NULL;
439 }
440 return elf_phdata;
441}
442
443#ifndef CONFIG_ARCH_BINFMT_ELF_STATE
444
445/**
446 * struct arch_elf_state - arch-specific ELF loading state
447 *
448 * This structure is used to preserve architecture specific data during
449 * the loading of an ELF file, throughout the checking of architecture
450 * specific ELF headers & through to the point where the ELF load is
451 * known to be proceeding (ie. SET_PERSONALITY).
452 *
453 * This implementation is a dummy for architectures which require no
454 * specific state.
455 */
456struct arch_elf_state {
457};
458
459#define INIT_ARCH_ELF_STATE {}
460
461/**
462 * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header
463 * @ehdr: The main ELF header
464 * @phdr: The program header to check
465 * @elf: The open ELF file
466 * @is_interp: True if the phdr is from the interpreter of the ELF being
467 * loaded, else false.
468 * @state: Architecture-specific state preserved throughout the process
469 * of loading the ELF.
470 *
471 * Inspects the program header phdr to validate its correctness and/or
472 * suitability for the system. Called once per ELF program header in the
473 * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its
474 * interpreter.
475 *
476 * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
477 * with that return code.
478 */
479static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
480 struct elf_phdr *phdr,
481 struct file *elf, bool is_interp,
482 struct arch_elf_state *state)
483{
484 /* Dummy implementation, always proceed */
485 return 0;
486}
487
488/**
489 * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
490 * @ehdr: The main ELF header
491 * @has_interp: True if the ELF has an interpreter, else false.
492 * @state: Architecture-specific state preserved throughout the process
493 * of loading the ELF.
494 *
495 * Provides a final opportunity for architecture code to reject the loading
496 * of the ELF & cause an exec syscall to return an error. This is called after
497 * all program headers to be checked by arch_elf_pt_proc have been.
498 *
499 * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
500 * with that return code.
501 */
502static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
503 struct arch_elf_state *state)
504{
505 /* Dummy implementation, always proceed */
506 return 0;
507}
508
509#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
389 510
390/* This is much more generalized than the library routine read function, 511/* This is much more generalized than the library routine read function,
391 so we keep this separate. Technically the library read function 512 so we keep this separate. Technically the library read function
@@ -394,16 +515,15 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
394 515
395static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, 516static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
396 struct file *interpreter, unsigned long *interp_map_addr, 517 struct file *interpreter, unsigned long *interp_map_addr,
397 unsigned long no_base) 518 unsigned long no_base, struct elf_phdr *interp_elf_phdata)
398{ 519{
399 struct elf_phdr *elf_phdata;
400 struct elf_phdr *eppnt; 520 struct elf_phdr *eppnt;
401 unsigned long load_addr = 0; 521 unsigned long load_addr = 0;
402 int load_addr_set = 0; 522 int load_addr_set = 0;
403 unsigned long last_bss = 0, elf_bss = 0; 523 unsigned long last_bss = 0, elf_bss = 0;
404 unsigned long error = ~0UL; 524 unsigned long error = ~0UL;
405 unsigned long total_size; 525 unsigned long total_size;
406 int retval, i, size; 526 int i;
407 527
408 /* First of all, some simple consistency checks */ 528 /* First of all, some simple consistency checks */
409 if (interp_elf_ex->e_type != ET_EXEC && 529 if (interp_elf_ex->e_type != ET_EXEC &&
@@ -414,40 +534,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
414 if (!interpreter->f_op->mmap) 534 if (!interpreter->f_op->mmap)
415 goto out; 535 goto out;
416 536
417 /* 537 total_size = total_mapping_size(interp_elf_phdata,
418 * If the size of this structure has changed, then punt, since 538 interp_elf_ex->e_phnum);
419 * we will be doing the wrong thing.
420 */
421 if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr))
422 goto out;
423 if (interp_elf_ex->e_phnum < 1 ||
424 interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
425 goto out;
426
427 /* Now read in all of the header information */
428 size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
429 if (size > ELF_MIN_ALIGN)
430 goto out;
431 elf_phdata = kmalloc(size, GFP_KERNEL);
432 if (!elf_phdata)
433 goto out;
434
435 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
436 (char *)elf_phdata, size);
437 error = -EIO;
438 if (retval != size) {
439 if (retval < 0)
440 error = retval;
441 goto out_close;
442 }
443
444 total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
445 if (!total_size) { 539 if (!total_size) {
446 error = -EINVAL; 540 error = -EINVAL;
447 goto out_close; 541 goto out;
448 } 542 }
449 543
450 eppnt = elf_phdata; 544 eppnt = interp_elf_phdata;
451 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { 545 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
452 if (eppnt->p_type == PT_LOAD) { 546 if (eppnt->p_type == PT_LOAD) {
453 int elf_type = MAP_PRIVATE | MAP_DENYWRITE; 547 int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
@@ -474,7 +568,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
474 *interp_map_addr = map_addr; 568 *interp_map_addr = map_addr;
475 error = map_addr; 569 error = map_addr;
476 if (BAD_ADDR(map_addr)) 570 if (BAD_ADDR(map_addr))
477 goto out_close; 571 goto out;
478 572
479 if (!load_addr_set && 573 if (!load_addr_set &&
480 interp_elf_ex->e_type == ET_DYN) { 574 interp_elf_ex->e_type == ET_DYN) {
@@ -493,7 +587,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
493 eppnt->p_memsz > TASK_SIZE || 587 eppnt->p_memsz > TASK_SIZE ||
494 TASK_SIZE - eppnt->p_memsz < k) { 588 TASK_SIZE - eppnt->p_memsz < k) {
495 error = -ENOMEM; 589 error = -ENOMEM;
496 goto out_close; 590 goto out;
497 } 591 }
498 592
499 /* 593 /*
@@ -523,7 +617,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
523 */ 617 */
524 if (padzero(elf_bss)) { 618 if (padzero(elf_bss)) {
525 error = -EFAULT; 619 error = -EFAULT;
526 goto out_close; 620 goto out;
527 } 621 }
528 622
529 /* What we have mapped so far */ 623 /* What we have mapped so far */
@@ -532,13 +626,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
532 /* Map the last of the bss segment */ 626 /* Map the last of the bss segment */
533 error = vm_brk(elf_bss, last_bss - elf_bss); 627 error = vm_brk(elf_bss, last_bss - elf_bss);
534 if (BAD_ADDR(error)) 628 if (BAD_ADDR(error))
535 goto out_close; 629 goto out;
536 } 630 }
537 631
538 error = load_addr; 632 error = load_addr;
539
540out_close:
541 kfree(elf_phdata);
542out: 633out:
543 return error; 634 return error;
544} 635}
@@ -575,10 +666,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
575 int load_addr_set = 0; 666 int load_addr_set = 0;
576 char * elf_interpreter = NULL; 667 char * elf_interpreter = NULL;
577 unsigned long error; 668 unsigned long error;
578 struct elf_phdr *elf_ppnt, *elf_phdata; 669 struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
579 unsigned long elf_bss, elf_brk; 670 unsigned long elf_bss, elf_brk;
580 int retval, i; 671 int retval, i;
581 unsigned int size;
582 unsigned long elf_entry; 672 unsigned long elf_entry;
583 unsigned long interp_load_addr = 0; 673 unsigned long interp_load_addr = 0;
584 unsigned long start_code, end_code, start_data, end_data; 674 unsigned long start_code, end_code, start_data, end_data;
@@ -589,6 +679,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
589 struct elfhdr elf_ex; 679 struct elfhdr elf_ex;
590 struct elfhdr interp_elf_ex; 680 struct elfhdr interp_elf_ex;
591 } *loc; 681 } *loc;
682 struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
592 683
593 loc = kmalloc(sizeof(*loc), GFP_KERNEL); 684 loc = kmalloc(sizeof(*loc), GFP_KERNEL);
594 if (!loc) { 685 if (!loc) {
@@ -611,26 +702,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
611 if (!bprm->file->f_op->mmap) 702 if (!bprm->file->f_op->mmap)
612 goto out; 703 goto out;
613 704
614 /* Now read in all of the header information */ 705 elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
615 if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
616 goto out;
617 if (loc->elf_ex.e_phnum < 1 ||
618 loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
619 goto out;
620 size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
621 retval = -ENOMEM;
622 elf_phdata = kmalloc(size, GFP_KERNEL);
623 if (!elf_phdata) 706 if (!elf_phdata)
624 goto out; 707 goto out;
625 708
626 retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
627 (char *)elf_phdata, size);
628 if (retval != size) {
629 if (retval >= 0)
630 retval = -EIO;
631 goto out_free_ph;
632 }
633
634 elf_ppnt = elf_phdata; 709 elf_ppnt = elf_phdata;
635 elf_bss = 0; 710 elf_bss = 0;
636 elf_brk = 0; 711 elf_brk = 0;
@@ -699,12 +774,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
699 774
700 elf_ppnt = elf_phdata; 775 elf_ppnt = elf_phdata;
701 for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) 776 for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
702 if (elf_ppnt->p_type == PT_GNU_STACK) { 777 switch (elf_ppnt->p_type) {
778 case PT_GNU_STACK:
703 if (elf_ppnt->p_flags & PF_X) 779 if (elf_ppnt->p_flags & PF_X)
704 executable_stack = EXSTACK_ENABLE_X; 780 executable_stack = EXSTACK_ENABLE_X;
705 else 781 else
706 executable_stack = EXSTACK_DISABLE_X; 782 executable_stack = EXSTACK_DISABLE_X;
707 break; 783 break;
784
785 case PT_LOPROC ... PT_HIPROC:
786 retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt,
787 bprm->file, false,
788 &arch_state);
789 if (retval)
790 goto out_free_dentry;
791 break;
708 } 792 }
709 793
710 /* Some simple consistency checks for the interpreter */ 794 /* Some simple consistency checks for the interpreter */
@@ -716,8 +800,36 @@ static int load_elf_binary(struct linux_binprm *bprm)
716 /* Verify the interpreter has a valid arch */ 800 /* Verify the interpreter has a valid arch */
717 if (!elf_check_arch(&loc->interp_elf_ex)) 801 if (!elf_check_arch(&loc->interp_elf_ex))
718 goto out_free_dentry; 802 goto out_free_dentry;
803
804 /* Load the interpreter program headers */
805 interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
806 interpreter);
807 if (!interp_elf_phdata)
808 goto out_free_dentry;
809
810 /* Pass PT_LOPROC..PT_HIPROC headers to arch code */
811 elf_ppnt = interp_elf_phdata;
812 for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
813 switch (elf_ppnt->p_type) {
814 case PT_LOPROC ... PT_HIPROC:
815 retval = arch_elf_pt_proc(&loc->interp_elf_ex,
816 elf_ppnt, interpreter,
817 true, &arch_state);
818 if (retval)
819 goto out_free_dentry;
820 break;
821 }
719 } 822 }
720 823
824 /*
825 * Allow arch code to reject the ELF at this point, whilst it's
826 * still possible to return an error to the code that invoked
827 * the exec syscall.
828 */
829 retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state);
830 if (retval)
831 goto out_free_dentry;
832
721 /* Flush all traces of the currently running executable */ 833 /* Flush all traces of the currently running executable */
722 retval = flush_old_exec(bprm); 834 retval = flush_old_exec(bprm);
723 if (retval) 835 if (retval)
@@ -725,7 +837,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
725 837
726 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 838 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
727 may depend on the personality. */ 839 may depend on the personality. */
728 SET_PERSONALITY(loc->elf_ex); 840 SET_PERSONALITY2(loc->elf_ex, &arch_state);
729 if (elf_read_implies_exec(loc->elf_ex, executable_stack)) 841 if (elf_read_implies_exec(loc->elf_ex, executable_stack))
730 current->personality |= READ_IMPLIES_EXEC; 842 current->personality |= READ_IMPLIES_EXEC;
731 843
@@ -890,7 +1002,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
890 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1002 elf_entry = load_elf_interp(&loc->interp_elf_ex,
891 interpreter, 1003 interpreter,
892 &interp_map_addr, 1004 &interp_map_addr,
893 load_bias); 1005 load_bias, interp_elf_phdata);
894 if (!IS_ERR((void *)elf_entry)) { 1006 if (!IS_ERR((void *)elf_entry)) {
895 /* 1007 /*
896 * load_elf_interp() returns relocation 1008 * load_elf_interp() returns relocation
@@ -917,6 +1029,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
917 } 1029 }
918 } 1030 }
919 1031
1032 kfree(interp_elf_phdata);
920 kfree(elf_phdata); 1033 kfree(elf_phdata);
921 1034
922 set_binfmt(&elf_format); 1035 set_binfmt(&elf_format);
@@ -981,6 +1094,7 @@ out_ret:
981 1094
982 /* error cleanup */ 1095 /* error cleanup */
983out_free_dentry: 1096out_free_dentry:
1097 kfree(interp_elf_phdata);
984 allow_write_access(interpreter); 1098 allow_write_access(interpreter);
985 if (interpreter) 1099 if (interpreter)
986 fput(interpreter); 1100 fput(interpreter);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
94#include <linux/mutex.h> 94#include <linux/mutex.h>
95#include <linux/genhd.h> 95#include <linux/genhd.h>
96#include <linux/blkdev.h> 96#include <linux/blkdev.h>
97#include <linux/vmalloc.h>
97#include "ctree.h" 98#include "ctree.h"
98#include "disk-io.h" 99#include "disk-io.h"
99#include "hash.h" 100#include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
326static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, 327static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
327 struct btrfsic_block_data_ctx *block_ctx_out, 328 struct btrfsic_block_data_ctx *block_ctx_out,
328 int mirror_num); 329 int mirror_num);
329static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
330 u32 len, struct block_device *bdev,
331 struct btrfsic_block_data_ctx *block_ctx_out);
332static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); 330static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
333static int btrfsic_read_block(struct btrfsic_state *state, 331static int btrfsic_read_block(struct btrfsic_state *state,
334 struct btrfsic_block_data_ctx *block_ctx); 332 struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
1326 l = NULL; 1324 l = NULL;
1327 next_block->generation = BTRFSIC_GENERATION_UNKNOWN; 1325 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1328 } else { 1326 } else {
1329 if (next_block->logical_bytenr != next_bytenr && 1327 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
1330 !(!next_block->is_metadata && 1328 if (next_block->logical_bytenr != next_bytenr &&
1331 0 == next_block->logical_bytenr)) { 1329 !(!next_block->is_metadata &&
1332 printk(KERN_INFO 1330 0 == next_block->logical_bytenr))
1333 "Referenced block @%llu (%s/%llu/%d)" 1331 printk(KERN_INFO
1334 " found in hash table, %c," 1332 "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
1335 " bytenr mismatch (!= stored %llu).\n", 1333 next_bytenr, next_block_ctx->dev->name,
1336 next_bytenr, next_block_ctx->dev->name, 1334 next_block_ctx->dev_bytenr, *mirror_nump,
1337 next_block_ctx->dev_bytenr, *mirror_nump, 1335 btrfsic_get_block_type(state,
1338 btrfsic_get_block_type(state, next_block), 1336 next_block),
1339 next_block->logical_bytenr); 1337 next_block->logical_bytenr);
1340 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1338 else
1341 printk(KERN_INFO 1339 printk(KERN_INFO
1342 "Referenced block @%llu (%s/%llu/%d)" 1340 "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
1343 " found in hash table, %c.\n", 1341 next_bytenr, next_block_ctx->dev->name,
1344 next_bytenr, next_block_ctx->dev->name, 1342 next_block_ctx->dev_bytenr, *mirror_nump,
1345 next_block_ctx->dev_bytenr, *mirror_nump, 1343 btrfsic_get_block_type(state,
1346 btrfsic_get_block_type(state, next_block)); 1344 next_block));
1345 }
1347 next_block->logical_bytenr = next_bytenr; 1346 next_block->logical_bytenr = next_bytenr;
1348 1347
1349 next_block->mirror_num = *mirror_nump; 1348 next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
1529 return -1; 1528 return -1;
1530 } 1529 }
1531 if (!block_was_created) { 1530 if (!block_was_created) {
1532 if (next_block->logical_bytenr != next_bytenr && 1531 if ((state->print_mask &
1532 BTRFSIC_PRINT_MASK_VERBOSE) &&
1533 next_block->logical_bytenr != next_bytenr &&
1533 !(!next_block->is_metadata && 1534 !(!next_block->is_metadata &&
1534 0 == next_block->logical_bytenr)) { 1535 0 == next_block->logical_bytenr)) {
1535 printk(KERN_INFO 1536 printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1607 return ret; 1608 return ret;
1608} 1609}
1609 1610
1610static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1611 u32 len, struct block_device *bdev,
1612 struct btrfsic_block_data_ctx *block_ctx_out)
1613{
1614 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1615 block_ctx_out->dev_bytenr = bytenr;
1616 block_ctx_out->start = bytenr;
1617 block_ctx_out->len = len;
1618 block_ctx_out->datav = NULL;
1619 block_ctx_out->pagev = NULL;
1620 block_ctx_out->mem_to_free = NULL;
1621 if (NULL != block_ctx_out->dev) {
1622 return 0;
1623 } else {
1624 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1625 return -ENXIO;
1626 }
1627}
1628
1629static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) 1611static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1630{ 1612{
1631 if (block_ctx->mem_to_free) { 1613 if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
1901 dev_state, 1883 dev_state,
1902 dev_bytenr); 1884 dev_bytenr);
1903 } 1885 }
1904 if (block->logical_bytenr != bytenr && 1886 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
1905 !(!block->is_metadata && 1887 if (block->logical_bytenr != bytenr &&
1906 block->logical_bytenr == 0)) 1888 !(!block->is_metadata &&
1907 printk(KERN_INFO 1889 block->logical_bytenr == 0))
1908 "Written block @%llu (%s/%llu/%d)" 1890 printk(KERN_INFO
1909 " found in hash table, %c," 1891 "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
1910 " bytenr mismatch" 1892 bytenr, dev_state->name,
1911 " (!= stored %llu).\n", 1893 dev_bytenr,
1912 bytenr, dev_state->name, dev_bytenr, 1894 block->mirror_num,
1913 block->mirror_num, 1895 btrfsic_get_block_type(state,
1914 btrfsic_get_block_type(state, block), 1896 block),
1915 block->logical_bytenr); 1897 block->logical_bytenr);
1916 else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1898 else
1917 printk(KERN_INFO 1899 printk(KERN_INFO
1918 "Written block @%llu (%s/%llu/%d)" 1900 "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
1919 " found in hash table, %c.\n", 1901 bytenr, dev_state->name,
1920 bytenr, dev_state->name, dev_bytenr, 1902 dev_bytenr, block->mirror_num,
1921 block->mirror_num, 1903 btrfsic_get_block_type(state,
1922 btrfsic_get_block_type(state, block)); 1904 block));
1905 }
1923 block->logical_bytenr = bytenr; 1906 block->logical_bytenr = bytenr;
1924 } else { 1907 } else {
1925 if (num_pages * PAGE_CACHE_SIZE < 1908 if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
2002 } 1985 }
2003 } 1986 }
2004 1987
2005 if (block->is_superblock)
2006 ret = btrfsic_map_superblock(state, bytenr,
2007 processed_len,
2008 bdev, &block_ctx);
2009 else
2010 ret = btrfsic_map_block(state, bytenr, processed_len,
2011 &block_ctx, 0);
2012 if (ret) {
2013 printk(KERN_INFO
2014 "btrfsic: btrfsic_map_block(root @%llu)"
2015 " failed!\n", bytenr);
2016 goto continue_loop;
2017 }
2018 block_ctx.datav = mapped_datav;
2019 /* the following is required in case of writes to mirrors,
2020 * use the same that was used for the lookup */
2021 block_ctx.dev = dev_state; 1988 block_ctx.dev = dev_state;
2022 block_ctx.dev_bytenr = dev_bytenr; 1989 block_ctx.dev_bytenr = dev_bytenr;
1990 block_ctx.start = bytenr;
1991 block_ctx.len = processed_len;
1992 block_ctx.pagev = NULL;
1993 block_ctx.mem_to_free = NULL;
1994 block_ctx.datav = mapped_datav;
2023 1995
2024 if (is_metadata || state->include_extent_data) { 1996 if (is_metadata || state->include_extent_data) {
2025 block->never_written = 0; 1997 block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
2133 /* this is getting ugly for the 2105 /* this is getting ugly for the
2134 * include_extent_data case... */ 2106 * include_extent_data case... */
2135 bytenr = 0; /* unknown */ 2107 bytenr = 0; /* unknown */
2136 block_ctx.start = bytenr;
2137 block_ctx.len = processed_len;
2138 block_ctx.mem_to_free = NULL;
2139 block_ctx.pagev = NULL;
2140 } else { 2108 } else {
2141 processed_len = state->metablock_size; 2109 processed_len = state->metablock_size;
2142 bytenr = btrfs_stack_header_bytenr( 2110 bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
2149 "Written block @%llu (%s/%llu/?)" 2117 "Written block @%llu (%s/%llu/?)"
2150 " !found in hash table, M.\n", 2118 " !found in hash table, M.\n",
2151 bytenr, dev_state->name, dev_bytenr); 2119 bytenr, dev_state->name, dev_bytenr);
2152
2153 ret = btrfsic_map_block(state, bytenr, processed_len,
2154 &block_ctx, 0);
2155 if (ret) {
2156 printk(KERN_INFO
2157 "btrfsic: btrfsic_map_block(root @%llu)"
2158 " failed!\n",
2159 dev_bytenr);
2160 goto continue_loop;
2161 }
2162 } 2120 }
2163 block_ctx.datav = mapped_datav; 2121
2164 /* the following is required in case of writes to mirrors,
2165 * use the same that was used for the lookup */
2166 block_ctx.dev = dev_state; 2122 block_ctx.dev = dev_state;
2167 block_ctx.dev_bytenr = dev_bytenr; 2123 block_ctx.dev_bytenr = dev_bytenr;
2124 block_ctx.start = bytenr;
2125 block_ctx.len = processed_len;
2126 block_ctx.pagev = NULL;
2127 block_ctx.mem_to_free = NULL;
2128 block_ctx.datav = mapped_datav;
2168 2129
2169 block = btrfsic_block_alloc(); 2130 block = btrfsic_block_alloc();
2170 if (NULL == block) { 2131 if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
3130 root->sectorsize, PAGE_CACHE_SIZE); 3091 root->sectorsize, PAGE_CACHE_SIZE);
3131 return -1; 3092 return -1;
3132 } 3093 }
3133 state = kzalloc(sizeof(*state), GFP_NOFS); 3094 state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
3134 if (NULL == state) { 3095 if (!state) {
3135 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); 3096 state = vzalloc(sizeof(*state));
3136 return -1; 3097 if (!state) {
3098 printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
3099 return -1;
3100 }
3137 } 3101 }
3138 3102
3139 if (!btrfsic_is_initialized) { 3103 if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
3277 3241
3278 mutex_unlock(&btrfsic_mutex); 3242 mutex_unlock(&btrfsic_mutex);
3279 3243
3280 kfree(state); 3244 if (is_vmalloc_addr(state))
3245 vfree(state);
3246 else
3247 kfree(state);
3281} 3248}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index dcd9be32ac57..e9df8862012c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
224 * Clear the writeback bits on all of the file 224 * Clear the writeback bits on all of the file
225 * pages for a compressed write 225 * pages for a compressed write
226 */ 226 */
227static noinline void end_compressed_writeback(struct inode *inode, u64 start, 227static noinline void end_compressed_writeback(struct inode *inode,
228 unsigned long ram_size) 228 const struct compressed_bio *cb)
229{ 229{
230 unsigned long index = start >> PAGE_CACHE_SHIFT; 230 unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
231 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; 231 unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
232 struct page *pages[16]; 232 struct page *pages[16];
233 unsigned long nr_pages = end_index - index + 1; 233 unsigned long nr_pages = end_index - index + 1;
234 int i; 234 int i;
235 int ret; 235 int ret;
236 236
237 if (cb->errors)
238 mapping_set_error(inode->i_mapping, -EIO);
239
237 while (nr_pages > 0) { 240 while (nr_pages > 0) {
238 ret = find_get_pages_contig(inode->i_mapping, index, 241 ret = find_get_pages_contig(inode->i_mapping, index,
239 min_t(unsigned long, 242 min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
244 continue; 247 continue;
245 } 248 }
246 for (i = 0; i < ret; i++) { 249 for (i = 0; i < ret; i++) {
250 if (cb->errors)
251 SetPageError(pages[i]);
247 end_page_writeback(pages[i]); 252 end_page_writeback(pages[i]);
248 page_cache_release(pages[i]); 253 page_cache_release(pages[i]);
249 } 254 }
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
287 tree->ops->writepage_end_io_hook(cb->compressed_pages[0], 292 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
288 cb->start, 293 cb->start,
289 cb->start + cb->len - 1, 294 cb->start + cb->len - 1,
290 NULL, 1); 295 NULL,
296 err ? 0 : 1);
291 cb->compressed_pages[0]->mapping = NULL; 297 cb->compressed_pages[0]->mapping = NULL;
292 298
293 end_compressed_writeback(inode, cb->start, cb->len); 299 end_compressed_writeback(inode, cb);
294 /* note, our inode could be gone now */ 300 /* note, our inode could be gone now */
295 301
296 /* 302 /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822ee0a0b..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2929,7 +2929,7 @@ done:
2929 */ 2929 */
2930 if (!p->leave_spinning) 2930 if (!p->leave_spinning)
2931 btrfs_set_path_blocking(p); 2931 btrfs_set_path_blocking(p);
2932 if (ret < 0) 2932 if (ret < 0 && !p->skip_release_on_error)
2933 btrfs_release_path(p); 2933 btrfs_release_path(p);
2934 return ret; 2934 return ret;
2935} 2935}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edda11fb..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
607 unsigned int leave_spinning:1; 607 unsigned int leave_spinning:1;
608 unsigned int search_commit_root:1; 608 unsigned int search_commit_root:1;
609 unsigned int need_commit_sem:1; 609 unsigned int need_commit_sem:1;
610 unsigned int skip_release_on_error:1;
610}; 611};
611 612
612/* 613/*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
1170 struct percpu_counter total_bytes_pinned; 1171 struct percpu_counter total_bytes_pinned;
1171 1172
1172 struct list_head list; 1173 struct list_head list;
1174 struct list_head ro_bgs;
1173 1175
1174 struct rw_semaphore groups_sem; 1176 struct rw_semaphore groups_sem;
1175 /* for block groups in our same type */ 1177 /* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
1276 unsigned int ro:1; 1278 unsigned int ro:1;
1277 unsigned int dirty:1; 1279 unsigned int dirty:1;
1278 unsigned int iref:1; 1280 unsigned int iref:1;
1281 unsigned int has_caching_ctl:1;
1282 unsigned int removed:1;
1279 1283
1280 int disk_cache_state; 1284 int disk_cache_state;
1281 1285
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
1305 1309
1306 /* For delayed block group creation or deletion of empty block groups */ 1310 /* For delayed block group creation or deletion of empty block groups */
1307 struct list_head bg_list; 1311 struct list_head bg_list;
1312
1313 /* For read-only block groups */
1314 struct list_head ro_list;
1315
1316 atomic_t trimming;
1308}; 1317};
1309 1318
1310/* delayed seq elem */ 1319/* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
1402 */ 1411 */
1403 u64 last_trans_log_full_commit; 1412 u64 last_trans_log_full_commit;
1404 unsigned long mount_opt; 1413 unsigned long mount_opt;
1414 /*
1415 * Track requests for actions that need to be done during transaction
1416 * commit (like for some mount options).
1417 */
1418 unsigned long pending_changes;
1405 unsigned long compress_type:4; 1419 unsigned long compress_type:4;
1406 int commit_interval; 1420 int commit_interval;
1407 /* 1421 /*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
1729 1743
1730 /* For btrfs to record security options */ 1744 /* For btrfs to record security options */
1731 struct security_mnt_opts security_opts; 1745 struct security_mnt_opts security_opts;
1746
1747 /*
1748 * Chunks that can't be freed yet (under a trim/discard operation)
1749 * and will be latter freed. Protected by fs_info->chunk_mutex.
1750 */
1751 struct list_head pinned_chunks;
1732}; 1752};
1733 1753
1734struct btrfs_subvolume_writers { 1754struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
2093#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2113#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
2094#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) 2114#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
2095#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) 2115#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
2096#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2097 2116
2098#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2117#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2099#define BTRFS_DEFAULT_MAX_INLINE (8192) 2118#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
2103#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) 2122#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
2104#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ 2123#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
2105 BTRFS_MOUNT_##opt) 2124 BTRFS_MOUNT_##opt)
2125
2106#define btrfs_set_and_info(root, opt, fmt, args...) \ 2126#define btrfs_set_and_info(root, opt, fmt, args...) \
2107{ \ 2127{ \
2108 if (!btrfs_test_opt(root, opt)) \ 2128 if (!btrfs_test_opt(root, opt)) \
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
2118} 2138}
2119 2139
2120/* 2140/*
2141 * Requests for changes that need to be done during transaction commit.
2142 *
2143 * Internal mount options that are used for special handling of the real
2144 * mount options (eg. cannot be set during remount and have to be set during
2145 * transaction commit)
2146 */
2147
2148#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
2149#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
2150#define BTRFS_PENDING_COMMIT (2)
2151
2152#define btrfs_test_pending(info, opt) \
2153 test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
2154#define btrfs_set_pending(info, opt) \
2155 set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
2156#define btrfs_clear_pending(info, opt) \
2157 clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
2158
2159/*
2160 * Helpers for setting pending mount option changes.
2161 *
2162 * Expects corresponding macros
2163 * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
2164 */
2165#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
2166do { \
2167 if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
2168 btrfs_info((info), fmt, ##args); \
2169 btrfs_set_pending((info), SET_##opt); \
2170 btrfs_clear_pending((info), CLEAR_##opt); \
2171 } \
2172} while(0)
2173
2174#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
2175do { \
2176 if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
2177 btrfs_info((info), fmt, ##args); \
2178 btrfs_set_pending((info), CLEAR_##opt); \
2179 btrfs_clear_pending((info), SET_##opt); \
2180 } \
2181} while(0)
2182
2183/*
2121 * Inode flags 2184 * Inode flags
2122 */ 2185 */
2123#define BTRFS_INODE_NODATASUM (1 << 0) 2186#define BTRFS_INODE_NODATASUM (1 << 0)
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3351 u64 type, u64 chunk_objectid, u64 chunk_offset, 3414 u64 type, u64 chunk_objectid, u64 chunk_offset,
3352 u64 size); 3415 u64 size);
3353int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3416int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3354 struct btrfs_root *root, u64 group_start); 3417 struct btrfs_root *root, u64 group_start,
3418 struct extent_map *em);
3355void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); 3419void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3356void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3420void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3357 struct btrfs_root *root); 3421 struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3427int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3491int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3428 struct btrfs_fs_info *fs_info); 3492 struct btrfs_fs_info *fs_info);
3429int __get_raid_index(u64 flags); 3493int __get_raid_index(u64 flags);
3430int btrfs_start_nocow_write(struct btrfs_root *root); 3494int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
3431void btrfs_end_nocow_write(struct btrfs_root *root); 3495void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
3432/* ctree.c */ 3496/* ctree.c */
3433int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3497int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3434 int level, int *slot); 3498 int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
3686int verify_dir_item(struct btrfs_root *root, 3750int verify_dir_item(struct btrfs_root *root,
3687 struct extent_buffer *leaf, 3751 struct extent_buffer *leaf,
3688 struct btrfs_dir_item *dir_item); 3752 struct btrfs_dir_item *dir_item);
3753struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
3754 struct btrfs_path *path,
3755 const char *name,
3756 int name_len);
3689 3757
3690/* orphan.c */ 3758/* orphan.c */
3691int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, 3759int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
3857 struct btrfs_trans_handle *trans, int mode, 3925 struct btrfs_trans_handle *trans, int mode,
3858 u64 start, u64 num_bytes, u64 min_size, 3926 u64 start, u64 num_bytes, u64 min_size,
3859 loff_t actual_len, u64 *alloc_hint); 3927 loff_t actual_len, u64 *alloc_hint);
3928int btrfs_inode_check_errors(struct inode *inode);
3860extern const struct dentry_operations btrfs_dentry_operations; 3929extern const struct dentry_operations btrfs_dentry_operations;
3861 3930
3862/* ioctl.c */ 3931/* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
3901 struct page **pages, size_t num_pages, 3970 struct page **pages, size_t num_pages,
3902 loff_t pos, size_t write_bytes, 3971 loff_t pos, size_t write_bytes,
3903 struct extent_state **cached); 3972 struct extent_state **cached);
3973int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
3904 3974
3905/* tree-defrag.c */ 3975/* tree-defrag.c */
3906int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 3976int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4097/* dev-replace.c */ 4167/* dev-replace.c */
4098void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); 4168void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4099void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); 4169void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4100void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); 4170void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
4171
4172static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
4173{
4174 btrfs_bio_counter_sub(fs_info, 1);
4175}
4101 4176
4102/* reada.c */ 4177/* reada.c */
4103struct reada_control { 4178struct reada_control {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b34ba0e..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
316 struct btrfs_device *tgt_device = NULL; 316 struct btrfs_device *tgt_device = NULL;
317 struct btrfs_device *src_device = NULL; 317 struct btrfs_device *src_device = NULL;
318 318
319 if (btrfs_fs_incompat(fs_info, RAID56)) {
320 btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
321 return -EOPNOTSUPP;
322 }
323
324 switch (args->start.cont_reading_from_srcdev_mode) { 319 switch (args->start.cont_reading_from_srcdev_mode) {
325 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 320 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
326 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 321 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
422 &dev_replace->scrub_progress, 0, 1); 417 &dev_replace->scrub_progress, 0, 1);
423 418
424 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 419 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
425 WARN_ON(ret); 420 /* don't warn if EINPROGRESS, someone else might be running scrub */
421 if (ret == -EINPROGRESS) {
422 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
423 ret = 0;
424 } else {
425 WARN_ON(ret);
426 }
426 427
427 return 0; 428 return ret;
428 429
429leave: 430leave:
430 dev_replace->srcdev = NULL; 431 dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
542 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 543 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
543 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 544 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
544 545
545 return 0; 546 return scrub_ret;
546 } 547 }
547 548
548 printk_in_rcu(KERN_INFO 549 printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
571 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 572 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
572 fs_info->fs_devices->rw_devices++; 573 fs_info->fs_devices->rw_devices++;
573 574
574 /* replace the sysfs entry */
575 btrfs_kobj_rm_device(fs_info, src_device);
576 btrfs_kobj_add_device(fs_info, tgt_device);
577
578 btrfs_dev_replace_unlock(dev_replace); 575 btrfs_dev_replace_unlock(dev_replace);
579 576
580 btrfs_rm_dev_replace_blocked(fs_info); 577 btrfs_rm_dev_replace_blocked(fs_info);
581 578
582 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 579 btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
583 580
584 btrfs_rm_dev_replace_unblocked(fs_info); 581 btrfs_rm_dev_replace_unblocked(fs_info);
585 582
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
594 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 591 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
595 mutex_unlock(&uuid_mutex); 592 mutex_unlock(&uuid_mutex);
596 593
594 /* replace the sysfs entry */
595 btrfs_kobj_rm_device(fs_info, src_device);
596 btrfs_kobj_add_device(fs_info, tgt_device);
597 btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
598
597 /* write back the superblocks */ 599 /* write back the superblocks */
598 trans = btrfs_start_transaction(root, 0); 600 trans = btrfs_start_transaction(root, 0);
599 if (!IS_ERR(trans)) 601 if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
920 percpu_counter_inc(&fs_info->bio_counter); 922 percpu_counter_inc(&fs_info->bio_counter);
921} 923}
922 924
923void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) 925void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
924{ 926{
925 percpu_counter_dec(&fs_info->bio_counter); 927 percpu_counter_sub(&fs_info->bio_counter, amount);
926 928
927 if (waitqueue_active(&fs_info->replace_wait)) 929 if (waitqueue_active(&fs_info->replace_wait))
928 wake_up(&fs_info->replace_wait); 930 wake_up(&fs_info->replace_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
21#include "hash.h" 21#include "hash.h"
22#include "transaction.h" 22#include "transaction.h"
23 23
24static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
25 struct btrfs_path *path,
26 const char *name, int name_len);
27
28/* 24/*
29 * insert a name into a directory, doing overflow properly if there is a hash 25 * insert a name into a directory, doing overflow properly if there is a hash
30 * collision. data_size indicates how big the item inserted should be. On 26 * collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
383 * this walks through all the entries in a dir item and finds one 379 * this walks through all the entries in a dir item and finds one
384 * for a specific name. 380 * for a specific name.
385 */ 381 */
386static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, 382struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
387 struct btrfs_path *path, 383 struct btrfs_path *path,
388 const char *name, int name_len) 384 const char *name, int name_len)
389{ 385{
390 struct btrfs_dir_item *dir_item; 386 struct btrfs_dir_item *dir_item;
391 unsigned long name_ptr; 387 unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f897065d..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
2384 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2384 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2385 init_waitqueue_head(&fs_info->async_submit_wait); 2385 init_waitqueue_head(&fs_info->async_submit_wait);
2386 2386
2387 INIT_LIST_HEAD(&fs_info->pinned_chunks);
2388
2387 ret = btrfs_alloc_stripe_hash_table(fs_info); 2389 ret = btrfs_alloc_stripe_hash_table(fs_info);
2388 if (ret) { 2390 if (ret) {
2389 err = ret; 2391 err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
2830 btrfs_set_opt(fs_info->mount_opt, SSD); 2832 btrfs_set_opt(fs_info->mount_opt, SSD);
2831 } 2833 }
2832 2834
2833 /* Set the real inode map cache flag */ 2835 /*
2834 if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) 2836 * Mount does not set all options immediatelly, we can do it now and do
2835 btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); 2837 * not have to wait for transaction commit
2838 */
2839 btrfs_apply_pending_changes(fs_info);
2836 2840
2837#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2841#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2838 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { 2842 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
3713 3717
3714 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3718 btrfs_free_block_rsv(root, root->orphan_block_rsv);
3715 root->orphan_block_rsv = NULL; 3719 root->orphan_block_rsv = NULL;
3720
3721 lock_chunks(root);
3722 while (!list_empty(&fs_info->pinned_chunks)) {
3723 struct extent_map *em;
3724
3725 em = list_first_entry(&fs_info->pinned_chunks,
3726 struct extent_map, list);
3727 list_del_init(&em->list);
3728 free_extent_map(em);
3729 }
3730 unlock_chunks(root);
3716} 3731}
3717 3732
3718int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 3733int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3839 */ 3854 */
3840 if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) 3855 if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
3841 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", 3856 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
3842 sb->root); 3857 btrfs_super_root(sb));
3843 if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) 3858 if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
3844 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", 3859 printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
3845 sb->chunk_root); 3860 btrfs_super_chunk_root(sb));
3846 if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) 3861 if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
3847 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", 3862 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
3848 btrfs_super_log_root(sb)); 3863 btrfs_super_log_root(sb));
3849 3864
3850 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { 3865 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
4129 return 0; 4144 return 0;
4130} 4145}
4131 4146
4147static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
4148 struct btrfs_fs_info *fs_info)
4149{
4150 struct btrfs_ordered_extent *ordered;
4151
4152 spin_lock(&fs_info->trans_lock);
4153 while (!list_empty(&cur_trans->pending_ordered)) {
4154 ordered = list_first_entry(&cur_trans->pending_ordered,
4155 struct btrfs_ordered_extent,
4156 trans_list);
4157 list_del_init(&ordered->trans_list);
4158 spin_unlock(&fs_info->trans_lock);
4159
4160 btrfs_put_ordered_extent(ordered);
4161 spin_lock(&fs_info->trans_lock);
4162 }
4163 spin_unlock(&fs_info->trans_lock);
4164}
4165
4132void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4166void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4133 struct btrfs_root *root) 4167 struct btrfs_root *root)
4134{ 4168{
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4140 cur_trans->state = TRANS_STATE_UNBLOCKED; 4174 cur_trans->state = TRANS_STATE_UNBLOCKED;
4141 wake_up(&root->fs_info->transaction_wait); 4175 wake_up(&root->fs_info->transaction_wait);
4142 4176
4177 btrfs_free_pending_ordered(cur_trans, root->fs_info);
4143 btrfs_destroy_delayed_inodes(root); 4178 btrfs_destroy_delayed_inodes(root);
4144 btrfs_assert_delayed_root_empty(root); 4179 btrfs_assert_delayed_root_empty(root);
4145 4180
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba141082..222d6aea4a8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
315 struct btrfs_caching_control *ctl; 315 struct btrfs_caching_control *ctl;
316 316
317 spin_lock(&cache->lock); 317 spin_lock(&cache->lock);
318 if (cache->cached != BTRFS_CACHE_STARTED) {
319 spin_unlock(&cache->lock);
320 return NULL;
321 }
322
323 /* We're loading it the fast way, so we don't have a caching_ctl. */
324 if (!cache->caching_ctl) { 318 if (!cache->caching_ctl) {
325 spin_unlock(&cache->lock); 319 spin_unlock(&cache->lock);
326 return NULL; 320 return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
594 spin_unlock(&cache->lock); 588 spin_unlock(&cache->lock);
595 589
596 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 590 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
591 mutex_lock(&caching_ctl->mutex);
597 ret = load_free_space_cache(fs_info, cache); 592 ret = load_free_space_cache(fs_info, cache);
598 593
599 spin_lock(&cache->lock); 594 spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
601 cache->caching_ctl = NULL; 596 cache->caching_ctl = NULL;
602 cache->cached = BTRFS_CACHE_FINISHED; 597 cache->cached = BTRFS_CACHE_FINISHED;
603 cache->last_byte_to_unpin = (u64)-1; 598 cache->last_byte_to_unpin = (u64)-1;
599 caching_ctl->progress = (u64)-1;
604 } else { 600 } else {
605 if (load_cache_only) { 601 if (load_cache_only) {
606 cache->caching_ctl = NULL; 602 cache->caching_ctl = NULL;
607 cache->cached = BTRFS_CACHE_NO; 603 cache->cached = BTRFS_CACHE_NO;
608 } else { 604 } else {
609 cache->cached = BTRFS_CACHE_STARTED; 605 cache->cached = BTRFS_CACHE_STARTED;
606 cache->has_caching_ctl = 1;
610 } 607 }
611 } 608 }
612 spin_unlock(&cache->lock); 609 spin_unlock(&cache->lock);
610 mutex_unlock(&caching_ctl->mutex);
611
613 wake_up(&caching_ctl->wait); 612 wake_up(&caching_ctl->wait);
614 if (ret == 1) { 613 if (ret == 1) {
615 put_caching_control(caching_ctl); 614 put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
627 cache->cached = BTRFS_CACHE_NO; 626 cache->cached = BTRFS_CACHE_NO;
628 } else { 627 } else {
629 cache->cached = BTRFS_CACHE_STARTED; 628 cache->cached = BTRFS_CACHE_STARTED;
629 cache->has_caching_ctl = 1;
630 } 630 }
631 spin_unlock(&cache->lock); 631 spin_unlock(&cache->lock);
632 wake_up(&caching_ctl->wait); 632 wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
3162 struct btrfs_block_group_cache *cache) 3162 struct btrfs_block_group_cache *cache)
3163{ 3163{
3164 struct rb_node *node; 3164 struct rb_node *node;
3165
3165 spin_lock(&root->fs_info->block_group_cache_lock); 3166 spin_lock(&root->fs_info->block_group_cache_lock);
3167
3168 /* If our block group was removed, we need a full search. */
3169 if (RB_EMPTY_NODE(&cache->cache_node)) {
3170 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3171
3172 spin_unlock(&root->fs_info->block_group_cache_lock);
3173 btrfs_put_block_group(cache);
3174 cache = btrfs_lookup_first_block_group(root->fs_info,
3175 next_bytenr);
3176 return cache;
3177 }
3166 node = rb_next(&cache->cache_node); 3178 node = rb_next(&cache->cache_node);
3167 btrfs_put_block_group(cache); 3179 btrfs_put_block_group(cache);
3168 if (node) { 3180 if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3504 found->chunk_alloc = 0; 3516 found->chunk_alloc = 0;
3505 found->flush = 0; 3517 found->flush = 0;
3506 init_waitqueue_head(&found->wait); 3518 init_waitqueue_head(&found->wait);
3519 INIT_LIST_HEAD(&found->ro_bgs);
3507 3520
3508 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3521 ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3509 info->space_info_kobj, "%s", 3522 info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
5425 spin_unlock(&cache->space_info->lock); 5438 spin_unlock(&cache->space_info->lock);
5426 } else { 5439 } else {
5427 old_val -= num_bytes; 5440 old_val -= num_bytes;
5441 btrfs_set_block_group_used(&cache->item, old_val);
5442 cache->pinned += num_bytes;
5443 cache->space_info->bytes_pinned += num_bytes;
5444 cache->space_info->bytes_used -= num_bytes;
5445 cache->space_info->disk_used -= num_bytes * factor;
5446 spin_unlock(&cache->lock);
5447 spin_unlock(&cache->space_info->lock);
5428 5448
5449 set_extent_dirty(info->pinned_extents,
5450 bytenr, bytenr + num_bytes - 1,
5451 GFP_NOFS | __GFP_NOFAIL);
5429 /* 5452 /*
5430 * No longer have used bytes in this block group, queue 5453 * No longer have used bytes in this block group, queue
5431 * it for deletion. 5454 * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
5439 } 5462 }
5440 spin_unlock(&info->unused_bgs_lock); 5463 spin_unlock(&info->unused_bgs_lock);
5441 } 5464 }
5442 btrfs_set_block_group_used(&cache->item, old_val);
5443 cache->pinned += num_bytes;
5444 cache->space_info->bytes_pinned += num_bytes;
5445 cache->space_info->bytes_used -= num_bytes;
5446 cache->space_info->disk_used -= num_bytes * factor;
5447 spin_unlock(&cache->lock);
5448 spin_unlock(&cache->space_info->lock);
5449
5450 set_extent_dirty(info->pinned_extents,
5451 bytenr, bytenr + num_bytes - 1,
5452 GFP_NOFS | __GFP_NOFAIL);
5453 } 5465 }
5454 btrfs_put_block_group(cache); 5466 btrfs_put_block_group(cache);
5455 total -= num_bytes; 5467 total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8511 min_allocable_bytes <= sinfo->total_bytes) { 8523 min_allocable_bytes <= sinfo->total_bytes) {
8512 sinfo->bytes_readonly += num_bytes; 8524 sinfo->bytes_readonly += num_bytes;
8513 cache->ro = 1; 8525 cache->ro = 1;
8526 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8514 ret = 0; 8527 ret = 0;
8515 } 8528 }
8516out: 8529out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8565 8578
8566/* 8579/*
8567 * helper to account the unused space of all the readonly block group in the 8580 * helper to account the unused space of all the readonly block group in the
8568 * list. takes mirrors into account. 8581 * space_info. takes mirrors into account.
8569 */ 8582 */
8570static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 8583u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8571{ 8584{
8572 struct btrfs_block_group_cache *block_group; 8585 struct btrfs_block_group_cache *block_group;
8573 u64 free_bytes = 0; 8586 u64 free_bytes = 0;
8574 int factor; 8587 int factor;
8575 8588
8576 list_for_each_entry(block_group, groups_list, list) { 8589 /* It's df, we don't care if it's racey */
8590 if (list_empty(&sinfo->ro_bgs))
8591 return 0;
8592
8593 spin_lock(&sinfo->lock);
8594 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8577 spin_lock(&block_group->lock); 8595 spin_lock(&block_group->lock);
8578 8596
8579 if (!block_group->ro) { 8597 if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8594 8612
8595 spin_unlock(&block_group->lock); 8613 spin_unlock(&block_group->lock);
8596 } 8614 }
8597
8598 return free_bytes;
8599}
8600
8601/*
8602 * helper to account the unused space of all the readonly block group in the
8603 * space_info. takes mirrors into account.
8604 */
8605u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8606{
8607 int i;
8608 u64 free_bytes = 0;
8609
8610 spin_lock(&sinfo->lock);
8611
8612 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8613 if (!list_empty(&sinfo->block_groups[i]))
8614 free_bytes += __btrfs_get_ro_block_group_free_space(
8615 &sinfo->block_groups[i]);
8616
8617 spin_unlock(&sinfo->lock); 8615 spin_unlock(&sinfo->lock);
8618 8616
8619 return free_bytes; 8617 return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
8633 cache->bytes_super - btrfs_block_group_used(&cache->item); 8631 cache->bytes_super - btrfs_block_group_used(&cache->item);
8634 sinfo->bytes_readonly -= num_bytes; 8632 sinfo->bytes_readonly -= num_bytes;
8635 cache->ro = 0; 8633 cache->ro = 0;
8634 list_del_init(&cache->ro_list);
8636 spin_unlock(&cache->lock); 8635 spin_unlock(&cache->lock);
8637 spin_unlock(&sinfo->lock); 8636 spin_unlock(&sinfo->lock);
8638} 8637}
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9002 INIT_LIST_HEAD(&cache->list); 9001 INIT_LIST_HEAD(&cache->list);
9003 INIT_LIST_HEAD(&cache->cluster_list); 9002 INIT_LIST_HEAD(&cache->cluster_list);
9004 INIT_LIST_HEAD(&cache->bg_list); 9003 INIT_LIST_HEAD(&cache->bg_list);
9004 INIT_LIST_HEAD(&cache->ro_list);
9005 btrfs_init_free_space_ctl(cache); 9005 btrfs_init_free_space_ctl(cache);
9006 atomic_set(&cache->trimming, 0);
9006 9007
9007 return cache; 9008 return cache;
9008} 9009}
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9195 int ret = 0; 9196 int ret = 0;
9196 9197
9197 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 9198 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9198 list_del_init(&block_group->bg_list);
9199 if (ret) 9199 if (ret)
9200 continue; 9200 goto next;
9201 9201
9202 spin_lock(&block_group->lock); 9202 spin_lock(&block_group->lock);
9203 memcpy(&item, &block_group->item, sizeof(item)); 9203 memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9212 key.objectid, key.offset); 9212 key.objectid, key.offset);
9213 if (ret) 9213 if (ret)
9214 btrfs_abort_transaction(trans, extent_root, ret); 9214 btrfs_abort_transaction(trans, extent_root, ret);
9215next:
9216 list_del_init(&block_group->bg_list);
9215 } 9217 }
9216} 9218}
9217 9219
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9304} 9306}
9305 9307
9306int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9308int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9307 struct btrfs_root *root, u64 group_start) 9309 struct btrfs_root *root, u64 group_start,
9310 struct extent_map *em)
9308{ 9311{
9309 struct btrfs_path *path; 9312 struct btrfs_path *path;
9310 struct btrfs_block_group_cache *block_group; 9313 struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9316 int ret; 9319 int ret;
9317 int index; 9320 int index;
9318 int factor; 9321 int factor;
9322 struct btrfs_caching_control *caching_ctl = NULL;
9323 bool remove_em;
9319 9324
9320 root = root->fs_info->extent_root; 9325 root = root->fs_info->extent_root;
9321 9326
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9400 spin_lock(&root->fs_info->block_group_cache_lock); 9405 spin_lock(&root->fs_info->block_group_cache_lock);
9401 rb_erase(&block_group->cache_node, 9406 rb_erase(&block_group->cache_node,
9402 &root->fs_info->block_group_cache_tree); 9407 &root->fs_info->block_group_cache_tree);
9408 RB_CLEAR_NODE(&block_group->cache_node);
9403 9409
9404 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9410 if (root->fs_info->first_logical_byte == block_group->key.objectid)
9405 root->fs_info->first_logical_byte = (u64)-1; 9411 root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9411 * are still on the list after taking the semaphore 9417 * are still on the list after taking the semaphore
9412 */ 9418 */
9413 list_del_init(&block_group->list); 9419 list_del_init(&block_group->list);
9420 list_del_init(&block_group->ro_list);
9414 if (list_empty(&block_group->space_info->block_groups[index])) { 9421 if (list_empty(&block_group->space_info->block_groups[index])) {
9415 kobj = block_group->space_info->block_group_kobjs[index]; 9422 kobj = block_group->space_info->block_group_kobjs[index];
9416 block_group->space_info->block_group_kobjs[index] = NULL; 9423 block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9422 kobject_put(kobj); 9429 kobject_put(kobj);
9423 } 9430 }
9424 9431
9432 if (block_group->has_caching_ctl)
9433 caching_ctl = get_caching_control(block_group);
9425 if (block_group->cached == BTRFS_CACHE_STARTED) 9434 if (block_group->cached == BTRFS_CACHE_STARTED)
9426 wait_block_group_cache_done(block_group); 9435 wait_block_group_cache_done(block_group);
9436 if (block_group->has_caching_ctl) {
9437 down_write(&root->fs_info->commit_root_sem);
9438 if (!caching_ctl) {
9439 struct btrfs_caching_control *ctl;
9440
9441 list_for_each_entry(ctl,
9442 &root->fs_info->caching_block_groups, list)
9443 if (ctl->block_group == block_group) {
9444 caching_ctl = ctl;
9445 atomic_inc(&caching_ctl->count);
9446 break;
9447 }
9448 }
9449 if (caching_ctl)
9450 list_del_init(&caching_ctl->list);
9451 up_write(&root->fs_info->commit_root_sem);
9452 if (caching_ctl) {
9453 /* Once for the caching bgs list and once for us. */
9454 put_caching_control(caching_ctl);
9455 put_caching_control(caching_ctl);
9456 }
9457 }
9427 9458
9428 btrfs_remove_free_space_cache(block_group); 9459 btrfs_remove_free_space_cache(block_group);
9429 9460
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9435 9466
9436 memcpy(&key, &block_group->key, sizeof(key)); 9467 memcpy(&key, &block_group->key, sizeof(key));
9437 9468
9469 lock_chunks(root);
9470 if (!list_empty(&em->list)) {
9471 /* We're in the transaction->pending_chunks list. */
9472 free_extent_map(em);
9473 }
9474 spin_lock(&block_group->lock);
9475 block_group->removed = 1;
9476 /*
9477 * At this point trimming can't start on this block group, because we
9478 * removed the block group from the tree fs_info->block_group_cache_tree
9479 * so no one can't find it anymore and even if someone already got this
9480 * block group before we removed it from the rbtree, they have already
9481 * incremented block_group->trimming - if they didn't, they won't find
9482 * any free space entries because we already removed them all when we
9483 * called btrfs_remove_free_space_cache().
9484 *
9485 * And we must not remove the extent map from the fs_info->mapping_tree
9486 * to prevent the same logical address range and physical device space
9487 * ranges from being reused for a new block group. This is because our
9488 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9489 * completely transactionless, so while it is trimming a range the
9490 * currently running transaction might finish and a new one start,
9491 * allowing for new block groups to be created that can reuse the same
9492 * physical device locations unless we take this special care.
9493 */
9494 remove_em = (atomic_read(&block_group->trimming) == 0);
9495 /*
9496 * Make sure a trimmer task always sees the em in the pinned_chunks list
9497 * if it sees block_group->removed == 1 (needs to lock block_group->lock
9498 * before checking block_group->removed).
9499 */
9500 if (!remove_em) {
9501 /*
9502 * Our em might be in trans->transaction->pending_chunks which
9503 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9504 * and so is the fs_info->pinned_chunks list.
9505 *
9506 * So at this point we must be holding the chunk_mutex to avoid
9507 * any races with chunk allocation (more specifically at
9508 * volumes.c:contains_pending_extent()), to ensure it always
9509 * sees the em, either in the pending_chunks list or in the
9510 * pinned_chunks list.
9511 */
9512 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9513 }
9514 spin_unlock(&block_group->lock);
9515
9516 if (remove_em) {
9517 struct extent_map_tree *em_tree;
9518
9519 em_tree = &root->fs_info->mapping_tree.map_tree;
9520 write_lock(&em_tree->lock);
9521 /*
9522 * The em might be in the pending_chunks list, so make sure the
9523 * chunk mutex is locked, since remove_extent_mapping() will
9524 * delete us from that list.
9525 */
9526 remove_extent_mapping(em_tree, em);
9527 write_unlock(&em_tree->lock);
9528 /* once for the tree */
9529 free_extent_map(em);
9530 }
9531
9532 unlock_chunks(root);
9533
9438 btrfs_put_block_group(block_group); 9534 btrfs_put_block_group(block_group);
9439 btrfs_put_block_group(block_group); 9535 btrfs_put_block_group(block_group);
9440 9536
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9523 */ 9619 */
9524 start = block_group->key.objectid; 9620 start = block_group->key.objectid;
9525 end = start + block_group->key.offset - 1; 9621 end = start + block_group->key.offset - 1;
9526 clear_extent_bits(&fs_info->freed_extents[0], start, end, 9622 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
9527 EXTENT_DIRTY, GFP_NOFS); 9623 EXTENT_DIRTY, GFP_NOFS);
9528 clear_extent_bits(&fs_info->freed_extents[1], start, end, 9624 if (ret) {
9625 btrfs_set_block_group_rw(root, block_group);
9626 goto end_trans;
9627 }
9628 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
9529 EXTENT_DIRTY, GFP_NOFS); 9629 EXTENT_DIRTY, GFP_NOFS);
9630 if (ret) {
9631 btrfs_set_block_group_rw(root, block_group);
9632 goto end_trans;
9633 }
9530 9634
9531 /* Reset pinned so btrfs_put_block_group doesn't complain */ 9635 /* Reset pinned so btrfs_put_block_group doesn't complain */
9532 block_group->pinned = 0; 9636 block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9537 */ 9641 */
9538 ret = btrfs_remove_chunk(trans, root, 9642 ret = btrfs_remove_chunk(trans, root,
9539 block_group->key.objectid); 9643 block_group->key.objectid);
9644end_trans:
9540 btrfs_end_transaction(trans, root); 9645 btrfs_end_transaction(trans, root);
9541next: 9646next:
9542 btrfs_put_block_group(block_group); 9647 btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
9657} 9762}
9658 9763
9659/* 9764/*
9660 * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), 9765 * btrfs_{start,end}_write_no_snapshoting() are similar to
9661 * they are used to prevent the some tasks writing data into the page cache 9766 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
9662 * by nocow before the subvolume is snapshoted, but flush the data into 9767 * data into the page cache through nocow before the subvolume is snapshoted,
9663 * the disk after the snapshot creation. 9768 * but flush the data into disk after the snapshot creation, or to prevent
9769 * operations while snapshoting is ongoing and that cause the snapshot to be
9770 * inconsistent (writes followed by expanding truncates for example).
9664 */ 9771 */
9665void btrfs_end_nocow_write(struct btrfs_root *root) 9772void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
9666{ 9773{
9667 percpu_counter_dec(&root->subv_writers->counter); 9774 percpu_counter_dec(&root->subv_writers->counter);
9668 /* 9775 /*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
9674 wake_up(&root->subv_writers->wait); 9781 wake_up(&root->subv_writers->wait);
9675} 9782}
9676 9783
9677int btrfs_start_nocow_write(struct btrfs_root *root) 9784int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
9678{ 9785{
9679 if (atomic_read(&root->will_be_snapshoted)) 9786 if (atomic_read(&root->will_be_snapshoted))
9680 return 0; 9787 return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
9685 */ 9792 */
9686 smp_mb(); 9793 smp_mb();
9687 if (atomic_read(&root->will_be_snapshoted)) { 9794 if (atomic_read(&root->will_be_snapshoted)) {
9688 btrfs_end_nocow_write(root); 9795 btrfs_end_write_no_snapshoting(root);
9689 return 0; 9796 return 0;
9690 } 9797 }
9691 return 1; 9798 return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..4ebabd237153 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
595 clear = 1; 595 clear = 1;
596again: 596again:
597 if (!prealloc && (mask & __GFP_WAIT)) { 597 if (!prealloc && (mask & __GFP_WAIT)) {
598 /*
599 * Don't care for allocation failure here because we might end
600 * up not needing the pre-allocated extent state at all, which
601 * is the case if we only have in the tree extent states that
602 * cover our input range and don't cover too any other range.
603 * If we end up needing a new extent state we allocate it later.
604 */
598 prealloc = alloc_extent_state(mask); 605 prealloc = alloc_extent_state(mask);
599 if (!prealloc)
600 return -ENOMEM;
601 } 606 }
602 607
603 spin_lock(&tree->lock); 608 spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
796 state->state |= bits_to_set; 801 state->state |= bits_to_set;
797} 802}
798 803
799static void cache_state(struct extent_state *state, 804static void cache_state_if_flags(struct extent_state *state,
800 struct extent_state **cached_ptr) 805 struct extent_state **cached_ptr,
806 const u64 flags)
801{ 807{
802 if (cached_ptr && !(*cached_ptr)) { 808 if (cached_ptr && !(*cached_ptr)) {
803 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 809 if (!flags || (state->state & flags)) {
804 *cached_ptr = state; 810 *cached_ptr = state;
805 atomic_inc(&state->refs); 811 atomic_inc(&state->refs);
806 } 812 }
807 } 813 }
808} 814}
809 815
816static void cache_state(struct extent_state *state,
817 struct extent_state **cached_ptr)
818{
819 return cache_state_if_flags(state, cached_ptr,
820 EXTENT_IOBITS | EXTENT_BOUNDARY);
821}
822
810/* 823/*
811 * set some bits on a range in the tree. This may require allocations or 824 * set some bits on a range in the tree. This may require allocations or
812 * sleeping, so the gfp mask is used to indicate what is allowed. 825 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1058 int err = 0; 1071 int err = 0;
1059 u64 last_start; 1072 u64 last_start;
1060 u64 last_end; 1073 u64 last_end;
1074 bool first_iteration = true;
1061 1075
1062 btrfs_debug_check_extent_io_range(tree, start, end); 1076 btrfs_debug_check_extent_io_range(tree, start, end);
1063 1077
1064again: 1078again:
1065 if (!prealloc && (mask & __GFP_WAIT)) { 1079 if (!prealloc && (mask & __GFP_WAIT)) {
1080 /*
1081 * Best effort, don't worry if extent state allocation fails
1082 * here for the first iteration. We might have a cached state
1083 * that matches exactly the target range, in which case no
1084 * extent state allocations are needed. We'll only know this
1085 * after locking the tree.
1086 */
1066 prealloc = alloc_extent_state(mask); 1087 prealloc = alloc_extent_state(mask);
1067 if (!prealloc) 1088 if (!prealloc && !first_iteration)
1068 return -ENOMEM; 1089 return -ENOMEM;
1069 } 1090 }
1070 1091
@@ -1234,6 +1255,7 @@ search_again:
1234 spin_unlock(&tree->lock); 1255 spin_unlock(&tree->lock);
1235 if (mask & __GFP_WAIT) 1256 if (mask & __GFP_WAIT)
1236 cond_resched(); 1257 cond_resched();
1258 first_iteration = false;
1237 goto again; 1259 goto again;
1238} 1260}
1239 1261
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1482 state = find_first_extent_bit_state(tree, start, bits); 1504 state = find_first_extent_bit_state(tree, start, bits);
1483got_it: 1505got_it:
1484 if (state) { 1506 if (state) {
1485 cache_state(state, cached_state); 1507 cache_state_if_flags(state, cached_state, 0);
1486 *start_ret = state->start; 1508 *start_ret = state->start;
1487 *end_ret = state->end; 1509 *end_ret = state->end;
1488 ret = 0; 1510 ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1746 if (page_ops == 0) 1768 if (page_ops == 0)
1747 return 0; 1769 return 0;
1748 1770
1771 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1772 mapping_set_error(inode->i_mapping, -EIO);
1773
1749 while (nr_pages > 0) { 1774 while (nr_pages > 0) {
1750 ret = find_get_pages_contig(inode->i_mapping, index, 1775 ret = find_get_pages_contig(inode->i_mapping, index,
1751 min_t(unsigned long, 1776 min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1763 clear_page_dirty_for_io(pages[i]); 1788 clear_page_dirty_for_io(pages[i]);
1764 if (page_ops & PAGE_SET_WRITEBACK) 1789 if (page_ops & PAGE_SET_WRITEBACK)
1765 set_page_writeback(pages[i]); 1790 set_page_writeback(pages[i]);
1791 if (page_ops & PAGE_SET_ERROR)
1792 SetPageError(pages[i]);
1766 if (page_ops & PAGE_END_WRITEBACK) 1793 if (page_ops & PAGE_END_WRITEBACK)
1767 end_page_writeback(pages[i]); 1794 end_page_writeback(pages[i]);
1768 if (page_ops & PAGE_UNLOCK) 1795 if (page_ops & PAGE_UNLOCK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938be986..ece9ce87edff 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
49#define PAGE_SET_WRITEBACK (1 << 2) 49#define PAGE_SET_WRITEBACK (1 << 2)
50#define PAGE_END_WRITEBACK (1 << 3) 50#define PAGE_END_WRITEBACK (1 << 3)
51#define PAGE_SET_PRIVATE2 (1 << 4) 51#define PAGE_SET_PRIVATE2 (1 << 4)
52#define PAGE_SET_ERROR (1 << 5)
52 53
53/* 54/*
54 * page->private values. Every page that is controlled by the extent 55 * page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b39afb..6a98bddd8f33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
287 if (!em) 287 if (!em)
288 goto out; 288 goto out;
289 289
290 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
291 list_move(&em->list, &tree->modified_extents);
292 em->generation = gen; 290 em->generation = gen;
293 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 291 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
294 em->mod_start = em->start; 292 em->mod_start = em->start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceabd99a8..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1428 u64 num_bytes; 1428 u64 num_bytes;
1429 int ret; 1429 int ret;
1430 1430
1431 ret = btrfs_start_nocow_write(root); 1431 ret = btrfs_start_write_no_snapshoting(root);
1432 if (!ret) 1432 if (!ret)
1433 return -ENOSPC; 1433 return -ENOSPC;
1434 1434
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1452 if (ret <= 0) { 1452 if (ret <= 0) {
1453 ret = 0; 1453 ret = 0;
1454 btrfs_end_nocow_write(root); 1454 btrfs_end_write_no_snapshoting(root);
1455 } else { 1455 } else {
1456 *write_bytes = min_t(size_t, *write_bytes , 1456 *write_bytes = min_t(size_t, *write_bytes ,
1457 num_bytes - pos + lockstart); 1457 num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1543 btrfs_free_reserved_data_space(inode, 1543 btrfs_free_reserved_data_space(inode,
1544 reserve_bytes); 1544 reserve_bytes);
1545 else 1545 else
1546 btrfs_end_nocow_write(root); 1546 btrfs_end_write_no_snapshoting(root);
1547 break; 1547 break;
1548 } 1548 }
1549 1549
@@ -1632,7 +1632,7 @@ again:
1632 1632
1633 release_bytes = 0; 1633 release_bytes = 0;
1634 if (only_release_metadata) 1634 if (only_release_metadata)
1635 btrfs_end_nocow_write(root); 1635 btrfs_end_write_no_snapshoting(root);
1636 1636
1637 if (only_release_metadata && copied > 0) { 1637 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1638 u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
1661 1661
1662 if (release_bytes) { 1662 if (release_bytes) {
1663 if (only_release_metadata) { 1663 if (only_release_metadata) {
1664 btrfs_end_nocow_write(root); 1664 btrfs_end_write_no_snapshoting(root);
1665 btrfs_delalloc_release_metadata(inode, release_bytes); 1665 btrfs_delalloc_release_metadata(inode, release_bytes);
1666 } else { 1666 } else {
1667 btrfs_delalloc_release_space(inode, release_bytes); 1667 btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1676 loff_t pos) 1676 loff_t pos)
1677{ 1677{
1678 struct file *file = iocb->ki_filp; 1678 struct file *file = iocb->ki_filp;
1679 struct inode *inode = file_inode(file);
1679 ssize_t written; 1680 ssize_t written;
1680 ssize_t written_buffered; 1681 ssize_t written_buffered;
1681 loff_t endbyte; 1682 loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1692 err = written_buffered; 1693 err = written_buffered;
1693 goto out; 1694 goto out;
1694 } 1695 }
1696 /*
1697 * Ensure all data is persisted. We want the next direct IO read to be
1698 * able to read what was just written.
1699 */
1695 endbyte = pos + written_buffered - 1; 1700 endbyte = pos + written_buffered - 1;
1696 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1701 err = btrfs_fdatawrite_range(inode, pos, endbyte);
1702 if (err)
1703 goto out;
1704 err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1697 if (err) 1705 if (err)
1698 goto out; 1706 goto out;
1699 written += written_buffered; 1707 written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1854 int ret; 1862 int ret;
1855 1863
1856 atomic_inc(&BTRFS_I(inode)->sync_writers); 1864 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 1865 ret = btrfs_fdatawrite_range(inode, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers); 1866 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862 1867
1863 return ret; 1868 return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
2810 2815
2811 return 0; 2816 return 0;
2812} 2817}
2818
2819int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
2820{
2821 int ret;
2822
2823 /*
2824 * So with compression we will find and lock a dirty page and clear the
2825 * first one as dirty, setup an async extent, and immediately return
2826 * with the entire range locked but with nobody actually marked with
2827 * writeback. So we can't just filemap_write_and_wait_range() and
2828 * expect it to work since it will just kick off a thread to do the
2829 * actual work. So we need to call filemap_fdatawrite_range _again_
2830 * since it will wait on the page lock, which won't be unlocked until
2831 * after the pages have been marked as writeback and so we're good to go
2832 * from there. We have to do this otherwise we'll miss the ordered
2833 * extents and that results in badness. Please Josef, do not think you
2834 * know better and pull this out at some point in the future, it is
2835 * right and you are wrong.
2836 */
2837 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2838 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
2839 &BTRFS_I(inode)->runtime_flags))
2840 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2841
2842 return ret;
2843}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..030847bf7cec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,10 +27,17 @@
27#include "disk-io.h" 27#include "disk-io.h"
28#include "extent_io.h" 28#include "extent_io.h"
29#include "inode-map.h" 29#include "inode-map.h"
30#include "volumes.h"
30 31
31#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
32#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
33 34
35struct btrfs_trim_range {
36 u64 start;
37 u64 bytes;
38 struct list_head list;
39};
40
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 41static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 42 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 43static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
881 int ret; 888 int ret;
882 struct btrfs_free_cluster *cluster = NULL; 889 struct btrfs_free_cluster *cluster = NULL;
883 struct rb_node *node = rb_first(&ctl->free_space_offset); 890 struct rb_node *node = rb_first(&ctl->free_space_offset);
891 struct btrfs_trim_range *trim_entry;
884 892
885 /* Get the cluster for this block_group if it exists */ 893 /* Get the cluster for this block_group if it exists */
886 if (block_group && !list_empty(&block_group->cluster_list)) { 894 if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
916 cluster = NULL; 924 cluster = NULL;
917 } 925 }
918 } 926 }
927
928 /*
929 * Make sure we don't miss any range that was removed from our rbtree
930 * because trimming is running. Otherwise after a umount+mount (or crash
931 * after committing the transaction) we would leak free space and get
932 * an inconsistent free space cache report from fsck.
933 */
934 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
935 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
936 trim_entry->bytes, NULL);
937 if (ret)
938 goto fail;
939 *entries += 1;
940 }
941
919 return 0; 942 return 0;
920fail: 943fail:
921 return -ENOSPC; 944 return -ENOSPC;
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1135 1158
1136 io_ctl_set_generation(&io_ctl, trans->transid); 1159 io_ctl_set_generation(&io_ctl, trans->transid);
1137 1160
1161 mutex_lock(&ctl->cache_writeout_mutex);
1138 /* Write out the extent entries in the free space cache */ 1162 /* Write out the extent entries in the free space cache */
1139 ret = write_cache_extent_entries(&io_ctl, ctl, 1163 ret = write_cache_extent_entries(&io_ctl, ctl,
1140 block_group, &entries, &bitmaps, 1164 block_group, &entries, &bitmaps,
1141 &bitmap_list); 1165 &bitmap_list);
1142 if (ret) 1166 if (ret) {
1167 mutex_unlock(&ctl->cache_writeout_mutex);
1143 goto out_nospc; 1168 goto out_nospc;
1169 }
1144 1170
1145 /* 1171 /*
1146 * Some spaces that are freed in the current transaction are pinned, 1172 * Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1148 * committed, we shouldn't lose them. 1174 * committed, we shouldn't lose them.
1149 */ 1175 */
1150 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1176 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1151 if (ret) 1177 if (ret) {
1178 mutex_unlock(&ctl->cache_writeout_mutex);
1152 goto out_nospc; 1179 goto out_nospc;
1180 }
1153 1181
1154 /* At last, we write out all the bitmaps. */ 1182 /*
1183 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1184 * locked while doing it because a concurrent trim can be manipulating
1185 * or freeing the bitmap.
1186 */
1155 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1187 ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1188 mutex_unlock(&ctl->cache_writeout_mutex);
1156 if (ret) 1189 if (ret)
1157 goto out_nospc; 1190 goto out_nospc;
1158 1191
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2295 ctl->start = block_group->key.objectid; 2328 ctl->start = block_group->key.objectid;
2296 ctl->private = block_group; 2329 ctl->private = block_group;
2297 ctl->op = &free_space_op; 2330 ctl->op = &free_space_op;
2331 INIT_LIST_HEAD(&ctl->trimming_ranges);
2332 mutex_init(&ctl->cache_writeout_mutex);
2298 2333
2299 /* 2334 /*
2300 * we only want to have 32k of ram per block group for keeping 2335 * we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2911 2946
2912static int do_trimming(struct btrfs_block_group_cache *block_group, 2947static int do_trimming(struct btrfs_block_group_cache *block_group,
2913 u64 *total_trimmed, u64 start, u64 bytes, 2948 u64 *total_trimmed, u64 start, u64 bytes,
2914 u64 reserved_start, u64 reserved_bytes) 2949 u64 reserved_start, u64 reserved_bytes,
2950 struct btrfs_trim_range *trim_entry)
2915{ 2951{
2916 struct btrfs_space_info *space_info = block_group->space_info; 2952 struct btrfs_space_info *space_info = block_group->space_info;
2917 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 struct btrfs_fs_info *fs_info = block_group->fs_info;
2954 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918 int ret; 2955 int ret;
2919 int update = 0; 2956 int update = 0;
2920 u64 trimmed = 0; 2957 u64 trimmed = 0;
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
2934 if (!ret) 2971 if (!ret)
2935 *total_trimmed += trimmed; 2972 *total_trimmed += trimmed;
2936 2973
2974 mutex_lock(&ctl->cache_writeout_mutex);
2937 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2975 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2976 list_del(&trim_entry->list);
2977 mutex_unlock(&ctl->cache_writeout_mutex);
2938 2978
2939 if (update) { 2979 if (update) {
2940 spin_lock(&space_info->lock); 2980 spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2962 u64 bytes; 3002 u64 bytes;
2963 3003
2964 while (start < end) { 3004 while (start < end) {
3005 struct btrfs_trim_range trim_entry;
3006
3007 mutex_lock(&ctl->cache_writeout_mutex);
2965 spin_lock(&ctl->tree_lock); 3008 spin_lock(&ctl->tree_lock);
2966 3009
2967 if (ctl->free_space < minlen) { 3010 if (ctl->free_space < minlen) {
2968 spin_unlock(&ctl->tree_lock); 3011 spin_unlock(&ctl->tree_lock);
3012 mutex_unlock(&ctl->cache_writeout_mutex);
2969 break; 3013 break;
2970 } 3014 }
2971 3015
2972 entry = tree_search_offset(ctl, start, 0, 1); 3016 entry = tree_search_offset(ctl, start, 0, 1);
2973 if (!entry) { 3017 if (!entry) {
2974 spin_unlock(&ctl->tree_lock); 3018 spin_unlock(&ctl->tree_lock);
3019 mutex_unlock(&ctl->cache_writeout_mutex);
2975 break; 3020 break;
2976 } 3021 }
2977 3022
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2980 node = rb_next(&entry->offset_index); 3025 node = rb_next(&entry->offset_index);
2981 if (!node) { 3026 if (!node) {
2982 spin_unlock(&ctl->tree_lock); 3027 spin_unlock(&ctl->tree_lock);
3028 mutex_unlock(&ctl->cache_writeout_mutex);
2983 goto out; 3029 goto out;
2984 } 3030 }
2985 entry = rb_entry(node, struct btrfs_free_space, 3031 entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2988 3034
2989 if (entry->offset >= end) { 3035 if (entry->offset >= end) {
2990 spin_unlock(&ctl->tree_lock); 3036 spin_unlock(&ctl->tree_lock);
3037 mutex_unlock(&ctl->cache_writeout_mutex);
2991 break; 3038 break;
2992 } 3039 }
2993 3040
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2997 bytes = min(extent_start + extent_bytes, end) - start; 3044 bytes = min(extent_start + extent_bytes, end) - start;
2998 if (bytes < minlen) { 3045 if (bytes < minlen) {
2999 spin_unlock(&ctl->tree_lock); 3046 spin_unlock(&ctl->tree_lock);
3047 mutex_unlock(&ctl->cache_writeout_mutex);
3000 goto next; 3048 goto next;
3001 } 3049 }
3002 3050
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3004 kmem_cache_free(btrfs_free_space_cachep, entry); 3052 kmem_cache_free(btrfs_free_space_cachep, entry);
3005 3053
3006 spin_unlock(&ctl->tree_lock); 3054 spin_unlock(&ctl->tree_lock);
3055 trim_entry.start = extent_start;
3056 trim_entry.bytes = extent_bytes;
3057 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3058 mutex_unlock(&ctl->cache_writeout_mutex);
3007 3059
3008 ret = do_trimming(block_group, total_trimmed, start, bytes, 3060 ret = do_trimming(block_group, total_trimmed, start, bytes,
3009 extent_start, extent_bytes); 3061 extent_start, extent_bytes, &trim_entry);
3010 if (ret) 3062 if (ret)
3011 break; 3063 break;
3012next: 3064next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3035 3087
3036 while (offset < end) { 3088 while (offset < end) {
3037 bool next_bitmap = false; 3089 bool next_bitmap = false;
3090 struct btrfs_trim_range trim_entry;
3038 3091
3092 mutex_lock(&ctl->cache_writeout_mutex);
3039 spin_lock(&ctl->tree_lock); 3093 spin_lock(&ctl->tree_lock);
3040 3094
3041 if (ctl->free_space < minlen) { 3095 if (ctl->free_space < minlen) {
3042 spin_unlock(&ctl->tree_lock); 3096 spin_unlock(&ctl->tree_lock);
3097 mutex_unlock(&ctl->cache_writeout_mutex);
3043 break; 3098 break;
3044 } 3099 }
3045 3100
3046 entry = tree_search_offset(ctl, offset, 1, 0); 3101 entry = tree_search_offset(ctl, offset, 1, 0);
3047 if (!entry) { 3102 if (!entry) {
3048 spin_unlock(&ctl->tree_lock); 3103 spin_unlock(&ctl->tree_lock);
3104 mutex_unlock(&ctl->cache_writeout_mutex);
3049 next_bitmap = true; 3105 next_bitmap = true;
3050 goto next; 3106 goto next;
3051 } 3107 }
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3054 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3110 ret2 = search_bitmap(ctl, entry, &start, &bytes);
3055 if (ret2 || start >= end) { 3111 if (ret2 || start >= end) {
3056 spin_unlock(&ctl->tree_lock); 3112 spin_unlock(&ctl->tree_lock);
3113 mutex_unlock(&ctl->cache_writeout_mutex);
3057 next_bitmap = true; 3114 next_bitmap = true;
3058 goto next; 3115 goto next;
3059 } 3116 }
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3061 bytes = min(bytes, end - start); 3118 bytes = min(bytes, end - start);
3062 if (bytes < minlen) { 3119 if (bytes < minlen) {
3063 spin_unlock(&ctl->tree_lock); 3120 spin_unlock(&ctl->tree_lock);
3121 mutex_unlock(&ctl->cache_writeout_mutex);
3064 goto next; 3122 goto next;
3065 } 3123 }
3066 3124
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3069 free_bitmap(ctl, entry); 3127 free_bitmap(ctl, entry);
3070 3128
3071 spin_unlock(&ctl->tree_lock); 3129 spin_unlock(&ctl->tree_lock);
3130 trim_entry.start = start;
3131 trim_entry.bytes = bytes;
3132 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3133 mutex_unlock(&ctl->cache_writeout_mutex);
3072 3134
3073 ret = do_trimming(block_group, total_trimmed, start, bytes, 3135 ret = do_trimming(block_group, total_trimmed, start, bytes,
3074 start, bytes); 3136 start, bytes, &trim_entry);
3075 if (ret) 3137 if (ret)
3076 break; 3138 break;
3077next: 3139next:
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3101 3163
3102 *trimmed = 0; 3164 *trimmed = 0;
3103 3165
3166 spin_lock(&block_group->lock);
3167 if (block_group->removed) {
3168 spin_unlock(&block_group->lock);
3169 return 0;
3170 }
3171 atomic_inc(&block_group->trimming);
3172 spin_unlock(&block_group->lock);
3173
3104 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3174 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3105 if (ret) 3175 if (ret)
3106 return ret; 3176 goto out;
3107 3177
3108 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3178 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3179out:
3180 spin_lock(&block_group->lock);
3181 if (atomic_dec_and_test(&block_group->trimming) &&
3182 block_group->removed) {
3183 struct extent_map_tree *em_tree;
3184 struct extent_map *em;
3185
3186 spin_unlock(&block_group->lock);
3187
3188 em_tree = &block_group->fs_info->mapping_tree.map_tree;
3189 write_lock(&em_tree->lock);
3190 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3191 1);
3192 BUG_ON(!em); /* logic error, can't happen */
3193 remove_extent_mapping(em_tree, em);
3194 write_unlock(&em_tree->lock);
3195
3196 lock_chunks(block_group->fs_info->chunk_root);
3197 list_del_init(&em->list);
3198 unlock_chunks(block_group->fs_info->chunk_root);
3199
3200 /* once for us and once for the tree */
3201 free_extent_map(em);
3202 free_extent_map(em);
3203
3204 /*
3205 * We've left one free space entry and other tasks trimming
3206 * this block group have left 1 entry each one. Free them.
3207 */
3208 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3209 } else {
3210 spin_unlock(&block_group->lock);
3211 }
3109 3212
3110 return ret; 3213 return ret;
3111} 3214}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
38 u64 start; 38 u64 start;
39 struct btrfs_free_space_op *op; 39 struct btrfs_free_space_op *op;
40 void *private; 40 void *private;
41 struct mutex cache_writeout_mutex;
42 struct list_head trimming_ranges;
41}; 43};
42 44
43struct btrfs_free_space_op { 45struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
178 root->root_key.objectid); 178 root->root_key.objectid);
179 if (IS_ERR(tsk)) { 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task"); 180 btrfs_warn(root->fs_info, "failed to start inode caching task");
181 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 181 btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
182 "disabling inode map caching"); 182 "disabling inode map caching");
183 } 183 }
184} 184}
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
364 ctl->start = 0; 364 ctl->start = 0;
365 ctl->private = NULL; 365 ctl->private = NULL;
366 ctl->op = &free_ino_op; 366 ctl->op = &free_ino_op;
367 INIT_LIST_HEAD(&ctl->trimming_ranges);
368 mutex_init(&ctl->cache_writeout_mutex);
367 369
368 /* 370 /*
369 * Initially we allow to use 16K of ram to cache chunks of 371 * Initially we allow to use 16K of ram to cache chunks of
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff0dcc016b71..e687bb0dc73a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
382 * are written in the same order that the flusher thread sent them 382 * are written in the same order that the flusher thread sent them
383 * down. 383 * down.
384 */ 384 */
385static noinline int compress_file_range(struct inode *inode, 385static noinline void compress_file_range(struct inode *inode,
386 struct page *locked_page, 386 struct page *locked_page,
387 u64 start, u64 end, 387 u64 start, u64 end,
388 struct async_cow *async_cow, 388 struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
412 btrfs_add_inode_defrag(NULL, inode); 412 btrfs_add_inode_defrag(NULL, inode);
413 413
414 /*
415 * skip compression for a small file range(<=blocksize) that
416 * isn't an inline extent, since it dosen't save disk space at all.
417 */
418 if ((end - start + 1) <= blocksize &&
419 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
420 goto cleanup_and_bail_uncompressed;
421
422 actual_end = min_t(u64, isize, end + 1); 414 actual_end = min_t(u64, isize, end + 1);
423again: 415again:
424 will_compress = 0; 416 will_compress = 0;
@@ -440,6 +432,14 @@ again:
440 432
441 total_compressed = actual_end - start; 433 total_compressed = actual_end - start;
442 434
435 /*
436 * skip compression for a small file range(<=blocksize) that
437 * isn't an inline extent, since it dosen't save disk space at all.
438 */
439 if (total_compressed <= blocksize &&
440 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
441 goto cleanup_and_bail_uncompressed;
442
443 /* we want to make sure that amount of ram required to uncompress 443 /* we want to make sure that amount of ram required to uncompress
444 * an extent is reasonable, so we limit the total size in ram 444 * an extent is reasonable, so we limit the total size in ram
445 * of a compressed extent to 128k. This is a crucial number 445 * of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
527 if (ret <= 0) { 527 if (ret <= 0) {
528 unsigned long clear_flags = EXTENT_DELALLOC | 528 unsigned long clear_flags = EXTENT_DELALLOC |
529 EXTENT_DEFRAG; 529 EXTENT_DEFRAG;
530 unsigned long page_error_op;
531
530 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; 532 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
533 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
531 534
532 /* 535 /*
533 * inline extent creation worked or returned error, 536 * inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
538 clear_flags, PAGE_UNLOCK | 541 clear_flags, PAGE_UNLOCK |
539 PAGE_CLEAR_DIRTY | 542 PAGE_CLEAR_DIRTY |
540 PAGE_SET_WRITEBACK | 543 PAGE_SET_WRITEBACK |
544 page_error_op |
541 PAGE_END_WRITEBACK); 545 PAGE_END_WRITEBACK);
542 goto free_pages_out; 546 goto free_pages_out;
543 } 547 }
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
620 *num_added += 1; 624 *num_added += 1;
621 } 625 }
622 626
623out: 627 return;
624 return ret;
625 628
626free_pages_out: 629free_pages_out:
627 for (i = 0; i < nr_pages_ret; i++) { 630 for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
629 page_cache_release(pages[i]); 632 page_cache_release(pages[i]);
630 } 633 }
631 kfree(pages); 634 kfree(pages);
635}
632 636
633 goto out; 637static void free_async_extent_pages(struct async_extent *async_extent)
638{
639 int i;
640
641 if (!async_extent->pages)
642 return;
643
644 for (i = 0; i < async_extent->nr_pages; i++) {
645 WARN_ON(async_extent->pages[i]->mapping);
646 page_cache_release(async_extent->pages[i]);
647 }
648 kfree(async_extent->pages);
649 async_extent->nr_pages = 0;
650 async_extent->pages = NULL;
634} 651}
635 652
636/* 653/*
@@ -639,7 +656,7 @@ free_pages_out:
639 * queued. We walk all the async extents created by compress_file_range 656 * queued. We walk all the async extents created by compress_file_range
640 * and send them down to the disk. 657 * and send them down to the disk.
641 */ 658 */
642static noinline int submit_compressed_extents(struct inode *inode, 659static noinline void submit_compressed_extents(struct inode *inode,
643 struct async_cow *async_cow) 660 struct async_cow *async_cow)
644{ 661{
645 struct async_extent *async_extent; 662 struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
651 struct extent_io_tree *io_tree; 668 struct extent_io_tree *io_tree;
652 int ret = 0; 669 int ret = 0;
653 670
654 if (list_empty(&async_cow->extents))
655 return 0;
656
657again: 671again:
658 while (!list_empty(&async_cow->extents)) { 672 while (!list_empty(&async_cow->extents)) {
659 async_extent = list_entry(async_cow->extents.next, 673 async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
709 async_extent->compressed_size, 723 async_extent->compressed_size,
710 0, alloc_hint, &ins, 1, 1); 724 0, alloc_hint, &ins, 1, 1);
711 if (ret) { 725 if (ret) {
712 int i; 726 free_async_extent_pages(async_extent);
713
714 for (i = 0; i < async_extent->nr_pages; i++) {
715 WARN_ON(async_extent->pages[i]->mapping);
716 page_cache_release(async_extent->pages[i]);
717 }
718 kfree(async_extent->pages);
719 async_extent->nr_pages = 0;
720 async_extent->pages = NULL;
721 727
722 if (ret == -ENOSPC) { 728 if (ret == -ENOSPC) {
723 unlock_extent(io_tree, async_extent->start, 729 unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
814 ins.objectid, 820 ins.objectid,
815 ins.offset, async_extent->pages, 821 ins.offset, async_extent->pages,
816 async_extent->nr_pages); 822 async_extent->nr_pages);
823 if (ret) {
824 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
825 struct page *p = async_extent->pages[0];
826 const u64 start = async_extent->start;
827 const u64 end = start + async_extent->ram_size - 1;
828
829 p->mapping = inode->i_mapping;
830 tree->ops->writepage_end_io_hook(p, start, end,
831 NULL, 0);
832 p->mapping = NULL;
833 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
834 PAGE_END_WRITEBACK |
835 PAGE_SET_ERROR);
836 free_async_extent_pages(async_extent);
837 }
817 alloc_hint = ins.objectid + ins.offset; 838 alloc_hint = ins.objectid + ins.offset;
818 kfree(async_extent); 839 kfree(async_extent);
819 if (ret)
820 goto out;
821 cond_resched(); 840 cond_resched();
822 } 841 }
823 ret = 0; 842 return;
824out:
825 return ret;
826out_free_reserve: 843out_free_reserve:
827 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 844 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
828out_free: 845out_free:
@@ -832,7 +849,9 @@ out_free:
832 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 849 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
833 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 850 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
834 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 851 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
835 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 852 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
853 PAGE_SET_ERROR);
854 free_async_extent_pages(async_extent);
836 kfree(async_extent); 855 kfree(async_extent);
837 goto again; 856 goto again;
838} 857}
@@ -1318,7 +1337,7 @@ next_slot:
1318 * we fall into common COW way. 1337 * we fall into common COW way.
1319 */ 1338 */
1320 if (!nolock) { 1339 if (!nolock) {
1321 err = btrfs_start_nocow_write(root); 1340 err = btrfs_start_write_no_snapshoting(root);
1322 if (!err) 1341 if (!err)
1323 goto out_check; 1342 goto out_check;
1324 } 1343 }
@@ -1342,7 +1361,7 @@ out_check:
1342 if (extent_end <= start) { 1361 if (extent_end <= start) {
1343 path->slots[0]++; 1362 path->slots[0]++;
1344 if (!nolock && nocow) 1363 if (!nolock && nocow)
1345 btrfs_end_nocow_write(root); 1364 btrfs_end_write_no_snapshoting(root);
1346 goto next_slot; 1365 goto next_slot;
1347 } 1366 }
1348 if (!nocow) { 1367 if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
1362 page_started, nr_written, 1); 1381 page_started, nr_written, 1);
1363 if (ret) { 1382 if (ret) {
1364 if (!nolock && nocow) 1383 if (!nolock && nocow)
1365 btrfs_end_nocow_write(root); 1384 btrfs_end_write_no_snapshoting(root);
1366 goto error; 1385 goto error;
1367 } 1386 }
1368 cow_start = (u64)-1; 1387 cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
1413 num_bytes); 1432 num_bytes);
1414 if (ret) { 1433 if (ret) {
1415 if (!nolock && nocow) 1434 if (!nolock && nocow)
1416 btrfs_end_nocow_write(root); 1435 btrfs_end_write_no_snapshoting(root);
1417 goto error; 1436 goto error;
1418 } 1437 }
1419 } 1438 }
@@ -1424,7 +1443,7 @@ out_check:
1424 EXTENT_DELALLOC, PAGE_UNLOCK | 1443 EXTENT_DELALLOC, PAGE_UNLOCK |
1425 PAGE_SET_PRIVATE2); 1444 PAGE_SET_PRIVATE2);
1426 if (!nolock && nocow) 1445 if (!nolock && nocow)
1427 btrfs_end_nocow_write(root); 1446 btrfs_end_write_no_snapshoting(root);
1428 cur_offset = extent_end; 1447 cur_offset = extent_end;
1429 if (cur_offset > end) 1448 if (cur_offset > end)
1430 break; 1449 break;
@@ -4580,6 +4599,26 @@ next:
4580 return err; 4599 return err;
4581} 4600}
4582 4601
4602static int wait_snapshoting_atomic_t(atomic_t *a)
4603{
4604 schedule();
4605 return 0;
4606}
4607
4608static void wait_for_snapshot_creation(struct btrfs_root *root)
4609{
4610 while (true) {
4611 int ret;
4612
4613 ret = btrfs_start_write_no_snapshoting(root);
4614 if (ret)
4615 break;
4616 wait_on_atomic_t(&root->will_be_snapshoted,
4617 wait_snapshoting_atomic_t,
4618 TASK_UNINTERRUPTIBLE);
4619 }
4620}
4621
4583static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4622static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4584{ 4623{
4585 struct btrfs_root *root = BTRFS_I(inode)->root; 4624 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4604 4643
4605 if (newsize > oldsize) { 4644 if (newsize > oldsize) {
4606 truncate_pagecache(inode, newsize); 4645 truncate_pagecache(inode, newsize);
4646 /*
4647 * Don't do an expanding truncate while snapshoting is ongoing.
4648 * This is to ensure the snapshot captures a fully consistent
4649 * state of this file - if the snapshot captures this expanding
4650 * truncation, it must capture all writes that happened before
4651 * this truncation.
4652 */
4653 wait_for_snapshot_creation(root);
4607 ret = btrfs_cont_expand(inode, oldsize, newsize); 4654 ret = btrfs_cont_expand(inode, oldsize, newsize);
4608 if (ret) 4655 if (ret) {
4656 btrfs_end_write_no_snapshoting(root);
4609 return ret; 4657 return ret;
4658 }
4610 4659
4611 trans = btrfs_start_transaction(root, 1); 4660 trans = btrfs_start_transaction(root, 1);
4612 if (IS_ERR(trans)) 4661 if (IS_ERR(trans)) {
4662 btrfs_end_write_no_snapshoting(root);
4613 return PTR_ERR(trans); 4663 return PTR_ERR(trans);
4664 }
4614 4665
4615 i_size_write(inode, newsize); 4666 i_size_write(inode, newsize);
4616 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4667 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4617 ret = btrfs_update_inode(trans, root, inode); 4668 ret = btrfs_update_inode(trans, root, inode);
4669 btrfs_end_write_no_snapshoting(root);
4618 btrfs_end_transaction(trans, root); 4670 btrfs_end_transaction(trans, root);
4619 } else { 4671 } else {
4620 4672
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7000 btrfs_put_ordered_extent(ordered); 7052 btrfs_put_ordered_extent(ordered);
7001 } else { 7053 } else {
7002 /* Screw you mmap */ 7054 /* Screw you mmap */
7003 ret = filemap_write_and_wait_range(inode->i_mapping, 7055 ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
7004 lockstart, 7056 if (ret)
7005 lockend); 7057 break;
7058 ret = filemap_fdatawait_range(inode->i_mapping,
7059 lockstart,
7060 lockend);
7006 if (ret) 7061 if (ret)
7007 break; 7062 break;
7008 7063
@@ -9442,6 +9497,21 @@ out_inode:
9442 9497
9443} 9498}
9444 9499
9500/* Inspired by filemap_check_errors() */
9501int btrfs_inode_check_errors(struct inode *inode)
9502{
9503 int ret = 0;
9504
9505 if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
9506 test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
9507 ret = -ENOSPC;
9508 if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
9509 test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
9510 ret = -EIO;
9511
9512 return ret;
9513}
9514
9445static const struct inode_operations btrfs_dir_inode_operations = { 9515static const struct inode_operations btrfs_dir_inode_operations = {
9446 .getattr = btrfs_getattr, 9516 .getattr = btrfs_getattr,
9447 .lookup = btrfs_lookup, 9517 .lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 080fe66c0349..d49fe8a0f6b5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
617 return ret; 617 return ret;
618} 618}
619 619
620static void btrfs_wait_nocow_write(struct btrfs_root *root) 620static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
621{ 621{
622 s64 writers; 622 s64 writers;
623 DEFINE_WAIT(wait); 623 DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
649 649
650 atomic_inc(&root->will_be_snapshoted); 650 atomic_inc(&root->will_be_snapshoted);
651 smp_mb__after_atomic(); 651 smp_mb__after_atomic();
652 btrfs_wait_nocow_write(root); 652 btrfs_wait_for_no_snapshoting_writes(root);
653 653
654 ret = btrfs_start_delalloc_inodes(root, 0); 654 ret = btrfs_start_delalloc_inodes(root, 0);
655 if (ret) 655 if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
717 if (ret) 717 if (ret)
718 goto fail; 718 goto fail;
719 719
720 /*
721 * If orphan cleanup did remove any orphans, it means the tree was
722 * modified and therefore the commit root is not the same as the
723 * current root anymore. This is a problem, because send uses the
724 * commit root and therefore can see inode items that don't exist
725 * in the current root anymore, and for example make calls to
726 * btrfs_iget, which will do tree lookups based on the current root
727 * and not on the commit root. Those lookups will fail, returning a
728 * -ESTALE error, and making send fail with that error. So make sure
729 * a send does not see any orphans we have just removed, and that it
730 * will see the same inodes regardless of whether a transaction
731 * commit happened before it started (meaning that the commit root
732 * will be the same as the current root) or not.
733 */
734 if (readonly && pending_snapshot->snap->node !=
735 pending_snapshot->snap->commit_root) {
736 trans = btrfs_join_transaction(pending_snapshot->snap);
737 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
738 ret = PTR_ERR(trans);
739 goto fail;
740 }
741 if (!IS_ERR(trans)) {
742 ret = btrfs_commit_transaction(trans,
743 pending_snapshot->snap);
744 if (ret)
745 goto fail;
746 }
747 }
748
749 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 720 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
750 if (IS_ERR(inode)) { 721 if (IS_ERR(inode)) {
751 ret = PTR_ERR(inode); 722 ret = PTR_ERR(inode);
@@ -761,7 +732,8 @@ fail:
761free: 732free:
762 kfree(pending_snapshot); 733 kfree(pending_snapshot);
763out: 734out:
764 atomic_dec(&root->will_be_snapshoted); 735 if (atomic_dec_and_test(&root->will_be_snapshoted))
736 wake_up_atomic_t(&root->will_be_snapshoted);
765 return ret; 737 return ret;
766} 738}
767 739
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec4cc20..534544e08f76 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 INIT_LIST_HEAD(&entry->work_list); 220 INIT_LIST_HEAD(&entry->work_list);
221 init_completion(&entry->completion); 221 init_completion(&entry->completion);
222 INIT_LIST_HEAD(&entry->log_list); 222 INIT_LIST_HEAD(&entry->log_list);
223 INIT_LIST_HEAD(&entry->trans_list);
223 224
224 trace_btrfs_ordered_extent_add(inode, entry); 225 trace_btrfs_ordered_extent_add(inode, entry);
225 226
@@ -431,19 +432,31 @@ out:
431 432
432/* Needs to either be called under a log transaction or the log_mutex */ 433/* Needs to either be called under a log transaction or the log_mutex */
433void btrfs_get_logged_extents(struct inode *inode, 434void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list) 435 struct list_head *logged_list,
436 const loff_t start,
437 const loff_t end)
435{ 438{
436 struct btrfs_ordered_inode_tree *tree; 439 struct btrfs_ordered_inode_tree *tree;
437 struct btrfs_ordered_extent *ordered; 440 struct btrfs_ordered_extent *ordered;
438 struct rb_node *n; 441 struct rb_node *n;
442 struct rb_node *prev;
439 443
440 tree = &BTRFS_I(inode)->ordered_tree; 444 tree = &BTRFS_I(inode)->ordered_tree;
441 spin_lock_irq(&tree->lock); 445 spin_lock_irq(&tree->lock);
442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 446 n = __tree_search(&tree->tree, end, &prev);
447 if (!n)
448 n = prev;
449 for (; n; n = rb_prev(n)) {
443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 450 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
451 if (ordered->file_offset > end)
452 continue;
453 if (entry_end(ordered) <= start)
454 break;
444 if (!list_empty(&ordered->log_list)) 455 if (!list_empty(&ordered->log_list))
445 continue; 456 continue;
446 list_add_tail(&ordered->log_list, logged_list); 457 if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
458 continue;
459 list_add(&ordered->log_list, logged_list);
447 atomic_inc(&ordered->refs); 460 atomic_inc(&ordered->refs);
448 } 461 }
449 spin_unlock_irq(&tree->lock); 462 spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
472 spin_unlock_irq(&log->log_extents_lock[index]); 485 spin_unlock_irq(&log->log_extents_lock[index]);
473} 486}
474 487
475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 488void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
489 struct btrfs_root *log, u64 transid)
476{ 490{
477 struct btrfs_ordered_extent *ordered; 491 struct btrfs_ordered_extent *ordered;
478 int index = transid % 2; 492 int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
497 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, 511 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
498 &ordered->flags)); 512 &ordered->flags));
499 513
500 btrfs_put_ordered_extent(ordered); 514 if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
515 list_add_tail(&ordered->trans_list, &trans->ordered);
501 spin_lock_irq(&log->log_extents_lock[index]); 516 spin_lock_irq(&log->log_extents_lock[index]);
502 } 517 }
503 spin_unlock_irq(&log->log_extents_lock[index]); 518 spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
725 /* start IO across the range first to instantiate any delalloc 740 /* start IO across the range first to instantiate any delalloc
726 * extents 741 * extents
727 */ 742 */
728 ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 743 ret = btrfs_fdatawrite_range(inode, start, orig_end);
729 if (ret) 744 if (ret)
730 return ret; 745 return ret;
731 /* 746
732 * So with compression we will find and lock a dirty page and clear the
733 * first one as dirty, setup an async extent, and immediately return
734 * with the entire range locked but with nobody actually marked with
735 * writeback. So we can't just filemap_write_and_wait_range() and
736 * expect it to work since it will just kick off a thread to do the
737 * actual work. So we need to call filemap_fdatawrite_range _again_
738 * since it will wait on the page lock, which won't be unlocked until
739 * after the pages have been marked as writeback and so we're good to go
740 * from there. We have to do this otherwise we'll miss the ordered
741 * extents and that results in badness. Please Josef, do not think you
742 * know better and pull this out at some point in the future, it is
743 * right and you are wrong.
744 */
745 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
746 &BTRFS_I(inode)->runtime_flags)) {
747 ret = filemap_fdatawrite_range(inode->i_mapping, start,
748 orig_end);
749 if (ret)
750 return ret;
751 }
752 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); 747 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
753 if (ret) 748 if (ret)
754 return ret; 749 return ret;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621e..e96cd4ccd805 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
71 ordered extent */ 71 ordered extent */
72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ 72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
73 73
74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
75 * in the logging code. */
74struct btrfs_ordered_extent { 76struct btrfs_ordered_extent {
75 /* logical offset in the file */ 77 /* logical offset in the file */
76 u64 file_offset; 78 u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
121 /* If we need to wait on this to be done */ 123 /* If we need to wait on this to be done */
122 struct list_head log_list; 124 struct list_head log_list;
123 125
126 /* If the transaction needs to wait on this ordered extent */
127 struct list_head trans_list;
128
124 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
125 wait_queue_head_t wait; 130 wait_queue_head_t wait;
126 131
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
195void btrfs_get_logged_extents(struct inode *inode, 200void btrfs_get_logged_extents(struct inode *inode,
196 struct list_head *logged_list); 201 struct list_head *logged_list,
202 const loff_t start,
203 const loff_t end);
197void btrfs_put_logged_extents(struct list_head *logged_list); 204void btrfs_put_logged_extents(struct list_head *logged_list);
198void btrfs_submit_logged_extents(struct list_head *logged_list, 205void btrfs_submit_logged_extents(struct list_head *logged_list,
199 struct btrfs_root *log); 206 struct btrfs_root *log);
200void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 207void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
208 struct btrfs_root *log, u64 transid);
201void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
202int __init ordered_data_init(void); 210int __init ordered_data_init(void);
203void ordered_data_exit(void); 211void ordered_data_exit(void);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * bbio and raid_map is managed by the caller, so we shouldn't free
63 * them here. And besides that, all rbios with this flag should not
64 * be cached, because we need raid_map to check the rbios' stripe
65 * is the same or not, but it is very likely that the caller has
66 * free raid_map, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
61 69
62#define RBIO_CACHE_SIZE 1024 70#define RBIO_CACHE_SIZE 1024
63 71
72enum btrfs_rbio_ops {
73 BTRFS_RBIO_WRITE = 0,
74 BTRFS_RBIO_READ_REBUILD = 1,
75 BTRFS_RBIO_PARITY_SCRUB = 2,
76};
77
64struct btrfs_raid_bio { 78struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info; 79 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio; 80 struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
117 /* number of data stripes (no p/q) */ 131 /* number of data stripes (no p/q) */
118 int nr_data; 132 int nr_data;
119 133
134 int real_stripes;
135
136 int stripe_npages;
120 /* 137 /*
121 * set if we're doing a parity rebuild 138 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled 139 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of 140 * differently from a parity rebuild as part of
124 * rmw 141 * rmw
125 */ 142 */
126 int read_rebuild; 143 enum btrfs_rbio_ops operation;
127 144
128 /* first bad stripe */ 145 /* first bad stripe */
129 int faila; 146 int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
131 /* second bad stripe (for raid6 use) */ 148 /* second bad stripe (for raid6 use) */
132 int failb; 149 int failb;
133 150
151 int scrubp;
134 /* 152 /*
135 * number of pages needed to represent the full 153 * number of pages needed to represent the full
136 * stripe 154 * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
144 */ 162 */
145 int bio_list_bytes; 163 int bio_list_bytes;
146 164
165 int generic_bio_cnt;
166
147 atomic_t refs; 167 atomic_t refs;
148 168
169 atomic_t stripes_pending;
170
171 atomic_t error;
149 /* 172 /*
150 * these are two arrays of pointers. We allocate the 173 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their 174 * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
162 * here for faster lookup 185 * here for faster lookup
163 */ 186 */
164 struct page **bio_pages; 187 struct page **bio_pages;
188
189 /*
190 * bitmap to record which horizontal stripe has data
191 */
192 unsigned long *dbitmap;
165}; 193};
166 194
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 195static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio); 204static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 205static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178 206
207static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
208 int need_check);
209static void async_scrub_parity(struct btrfs_raid_bio *rbio);
210
179/* 211/*
180 * the stripe hash table is used for locking, and to collect 212 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe 213 * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
324{ 356{
325 bio_list_merge(&dest->bio_list, &victim->bio_list); 357 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes; 358 dest->bio_list_bytes += victim->bio_list_bytes;
359 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list); 360 bio_list_init(&victim->bio_list);
328} 361}
329 362
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
577 cur->raid_map[0]) 610 cur->raid_map[0])
578 return 0; 611 return 0;
579 612
580 /* reads can't merge with writes */ 613 /* we can't merge with different operations */
581 if (last->read_rebuild != 614 if (last->operation != cur->operation)
582 cur->read_rebuild) { 615 return 0;
616 /*
617 * We've need read the full stripe from the drive.
618 * check and repair the parity and write the new results.
619 *
620 * We're not allowed to add any new bios to the
621 * bio list here, anyone else that wants to
622 * change this stripe needs to do their own rmw.
623 */
624 if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
625 cur->operation == BTRFS_RBIO_PARITY_SCRUB)
583 return 0; 626 return 0;
584 }
585 627
586 return 1; 628 return 1;
587} 629}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
601 */ 643 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 644static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{ 645{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 646 if (rbio->nr_data + 1 == rbio->real_stripes)
605 return NULL; 647 return NULL;
606 648
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 649 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
772 spin_unlock(&rbio->bio_list_lock); 814 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags); 815 spin_unlock_irqrestore(&h->lock, flags);
774 816
775 if (next->read_rebuild) 817 if (next->operation == BTRFS_RBIO_READ_REBUILD)
776 async_read_rebuild(next); 818 async_read_rebuild(next);
777 else { 819 else if (next->operation == BTRFS_RBIO_WRITE) {
778 steal_rbio(rbio, next); 820 steal_rbio(rbio, next);
779 async_rmw_stripe(next); 821 async_rmw_stripe(next);
822 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
823 steal_rbio(rbio, next);
824 async_scrub_parity(next);
780 } 825 }
781 826
782 goto done_nolock; 827 goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
796 remove_rbio_from_cache(rbio); 841 remove_rbio_from_cache(rbio);
797} 842}
798 843
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
799static void __free_raid_bio(struct btrfs_raid_bio *rbio) 859static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{ 860{
801 int i; 861 int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
814 rbio->stripe_pages[i] = NULL; 874 rbio->stripe_pages[i] = NULL;
815 } 875 }
816 } 876 }
817 kfree(rbio->raid_map); 877
818 kfree(rbio->bbio); 878 free_bbio_and_raid_map(rbio);
879
819 kfree(rbio); 880 kfree(rbio);
820} 881}
821 882
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{ 894{
834 struct bio *cur = bio_list_get(&rbio->bio_list); 895 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next; 896 struct bio *next;
897
898 if (rbio->generic_bio_cnt)
899 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
900
836 free_raid_bio(rbio); 901 free_raid_bio(rbio);
837 902
838 while (cur) { 903 while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
858 923
859 bio_put(bio); 924 bio_put(bio);
860 925
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 926 if (!atomic_dec_and_test(&rbio->stripes_pending))
862 return; 927 return;
863 928
864 err = 0; 929 err = 0;
865 930
866 /* OK, we have read all the stripes we need to. */ 931 /* OK, we have read all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 932 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
868 err = -EIO; 933 err = -EIO;
869 934
870 rbio_orig_end_io(rbio, err, 0); 935 rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
925{ 990{
926 struct btrfs_raid_bio *rbio; 991 struct btrfs_raid_bio *rbio;
927 int nr_data = 0; 992 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 993 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
994 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
995 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
929 void *p; 996 void *p;
930 997
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 998 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
999 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
932 GFP_NOFS); 1000 GFP_NOFS);
933 if (!rbio) { 1001 if (!rbio)
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM); 1002 return ERR_PTR(-ENOMEM);
937 }
938 1003
939 bio_list_init(&rbio->bio_list); 1004 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list); 1005 INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
946 rbio->fs_info = root->fs_info; 1011 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len; 1012 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages; 1013 rbio->nr_pages = num_pages;
1014 rbio->real_stripes = real_stripes;
1015 rbio->stripe_npages = stripe_npages;
949 rbio->faila = -1; 1016 rbio->faila = -1;
950 rbio->failb = -1; 1017 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1); 1018 atomic_set(&rbio->refs, 1);
1019 atomic_set(&rbio->error, 0);
1020 atomic_set(&rbio->stripes_pending, 0);
952 1021
953 /* 1022 /*
954 * the stripe_pages and bio_pages array point to the extra 1023 * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
957 p = rbio + 1; 1026 p = rbio + 1;
958 rbio->stripe_pages = p; 1027 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
960 1030
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2; 1032 nr_data = real_stripes - 2;
963 else 1033 else
964 nr_data = bbio->num_stripes - 1; 1034 nr_data = real_stripes - 1;
965 1035
966 rbio->nr_data = nr_data; 1036 rbio->nr_data = nr_data;
967 return rbio; 1037 return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1143static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{ 1144{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1145 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1146 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1077 __raid56_parity_recover(rbio); 1147 __raid56_parity_recover(rbio);
1078 } else { 1148 } else {
1079 finish_rmw(rbio); 1149 finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1204static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{ 1205{
1136 struct btrfs_bio *bbio = rbio->bbio; 1206 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes]; 1207 void *pointers[rbio->real_stripes];
1138 int stripe_len = rbio->stripe_len; 1208 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data; 1209 int nr_data = rbio->nr_data;
1140 int stripe; 1210 int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1148 1218
1149 bio_list_init(&bio_list); 1219 bio_list_init(&bio_list);
1150 1220
1151 if (bbio->num_stripes - rbio->nr_data == 1) { 1221 if (rbio->real_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1; 1222 p_stripe = rbio->real_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) { 1223 } else if (rbio->real_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2; 1224 p_stripe = rbio->real_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1; 1225 q_stripe = rbio->real_stripes - 1;
1156 } else { 1226 } else {
1157 BUG(); 1227 BUG();
1158 } 1228 }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1239 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock); 1240 spin_unlock_irq(&rbio->bio_list_lock);
1171 1241
1172 atomic_set(&rbio->bbio->error, 0); 1242 atomic_set(&rbio->error, 0);
1173 1243
1174 /* 1244 /*
1175 * now that we've set rmw_locked, run through the 1245 * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1209 SetPageUptodate(p); 1279 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p); 1280 pointers[stripe++] = kmap(p);
1211 1281
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 1282 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1213 pointers); 1283 pointers);
1214 } else { 1284 } else {
1215 /* raid5 */ 1285 /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1218 } 1288 }
1219 1289
1220 1290
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++) 1291 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1292 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 } 1293 }
1224 1294
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1297 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else. 1298 * everything else.
1229 */ 1299 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1300 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1301 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page; 1302 struct page *page;
1233 if (stripe < rbio->nr_data) { 1303 if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1245 } 1315 }
1246 } 1316 }
1247 1317
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 1318 if (likely(!bbio->num_tgtdevs))
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 1319 goto write_data;
1320
1321 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1322 if (!bbio->tgtdev_map[stripe])
1323 continue;
1324
1325 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1326 struct page *page;
1327 if (stripe < rbio->nr_data) {
1328 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329 if (!page)
1330 continue;
1331 } else {
1332 page = rbio_stripe_page(rbio, stripe, pagenr);
1333 }
1334
1335 ret = rbio_add_io_page(rbio, &bio_list, page,
1336 rbio->bbio->tgtdev_map[stripe],
1337 pagenr, rbio->stripe_len);
1338 if (ret)
1339 goto cleanup;
1340 }
1341 }
1342
1343write_data:
1344 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1345 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1250 1346
1251 while (1) { 1347 while (1) {
1252 bio = bio_list_pop(&bio_list); 1348 bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1283 stripe = &rbio->bbio->stripes[i]; 1379 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical; 1380 stripe_start = stripe->physical;
1285 if (physical >= stripe_start && 1381 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) { 1382 physical < stripe_start + rbio->stripe_len &&
1383 bio->bi_bdev == stripe->dev->bdev) {
1287 return i; 1384 return i;
1288 } 1385 }
1289 } 1386 }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1331 if (rbio->faila == -1) { 1428 if (rbio->faila == -1) {
1332 /* first failure on this rbio */ 1429 /* first failure on this rbio */
1333 rbio->faila = failed; 1430 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error); 1431 atomic_inc(&rbio->error);
1335 } else if (rbio->failb == -1) { 1432 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */ 1433 /* second failure on this rbio */
1337 rbio->failb = failed; 1434 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error); 1435 atomic_inc(&rbio->error);
1339 } else { 1436 } else {
1340 ret = -EIO; 1437 ret = -EIO;
1341 } 1438 }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
1394 1491
1395 bio_put(bio); 1492 bio_put(bio);
1396 1493
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1494 if (!atomic_dec_and_test(&rbio->stripes_pending))
1398 return; 1495 return;
1399 1496
1400 err = 0; 1497 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1498 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1402 goto cleanup; 1499 goto cleanup;
1403 1500
1404 /* 1501 /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1536static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{ 1537{
1441 int bios_to_read = 0; 1538 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list; 1539 struct bio_list bio_list;
1444 int ret; 1540 int ret;
1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1541 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1455 1551
1456 index_rbio_pages(rbio); 1552 index_rbio_pages(rbio);
1457 1553
1458 atomic_set(&rbio->bbio->error, 0); 1554 atomic_set(&rbio->error, 0);
1459 /* 1555 /*
1460 * build a list of bios to read all the missing parts of this 1556 * build a list of bios to read all the missing parts of this
1461 * stripe 1557 * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1503 * the bbio may be freed once we submit the last bio. Make sure 1599 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that 1600 * not to touch it after that
1505 */ 1601 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read); 1602 atomic_set(&rbio->stripes_pending, bios_to_read);
1507 while (1) { 1603 while (1) {
1508 bio = bio_list_pop(&bio_list); 1604 bio = bio_list_pop(&bio_list);
1509 if (!bio) 1605 if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1686 struct btrfs_raid_bio *rbio; 1782 struct btrfs_raid_bio *rbio;
1687 struct btrfs_plug_cb *plug = NULL; 1783 struct btrfs_plug_cb *plug = NULL;
1688 struct blk_plug_cb *cb; 1784 struct blk_plug_cb *cb;
1785 int ret;
1689 1786
1690 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691 if (IS_ERR(rbio)) 1788 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1);
1692 return PTR_ERR(rbio); 1790 return PTR_ERR(rbio);
1791 }
1693 bio_list_add(&rbio->bio_list, bio); 1792 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1793 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1794 rbio->operation = BTRFS_RBIO_WRITE;
1795
1796 btrfs_bio_counter_inc_noblocked(root->fs_info);
1797 rbio->generic_bio_cnt = 1;
1695 1798
1696 /* 1799 /*
1697 * don't plug on full rbios, just get them out the door 1800 * don't plug on full rbios, just get them out the door
1698 * as quickly as we can 1801 * as quickly as we can
1699 */ 1802 */
1700 if (rbio_is_full(rbio)) 1803 if (rbio_is_full(rbio)) {
1701 return full_stripe_write(rbio); 1804 ret = full_stripe_write(rbio);
1805 if (ret)
1806 btrfs_bio_counter_dec(root->fs_info);
1807 return ret;
1808 }
1702 1809
1703 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1810 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704 sizeof(*plug)); 1811 sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709 INIT_LIST_HEAD(&plug->rbio_list); 1816 INIT_LIST_HEAD(&plug->rbio_list);
1710 } 1817 }
1711 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1818 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1819 ret = 0;
1712 } else { 1820 } else {
1713 return __raid56_parity_write(rbio); 1821 ret = __raid56_parity_write(rbio);
1822 if (ret)
1823 btrfs_bio_counter_dec(root->fs_info);
1714 } 1824 }
1715 return 0; 1825 return ret;
1716} 1826}
1717 1827
1718/* 1828/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1730 int err; 1840 int err;
1731 int i; 1841 int i;
1732 1842
1733 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1843 pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1734 GFP_NOFS); 1844 GFP_NOFS);
1735 if (!pointers) { 1845 if (!pointers) {
1736 err = -ENOMEM; 1846 err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1740 faila = rbio->faila; 1850 faila = rbio->faila;
1741 failb = rbio->failb; 1851 failb = rbio->failb;
1742 1852
1743 if (rbio->read_rebuild) { 1853 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1744 spin_lock_irq(&rbio->bio_list_lock); 1854 spin_lock_irq(&rbio->bio_list_lock);
1745 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1855 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746 spin_unlock_irq(&rbio->bio_list_lock); 1856 spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1749 index_rbio_pages(rbio); 1859 index_rbio_pages(rbio);
1750 1860
1751 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1861 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1862 /*
1863 * Now we just use bitmap to mark the horizontal stripes in
1864 * which we have data when doing parity scrub.
1865 */
1866 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1867 !test_bit(pagenr, rbio->dbitmap))
1868 continue;
1869
1752 /* setup our array of pointers with pages 1870 /* setup our array of pointers with pages
1753 * from each stripe 1871 * from each stripe
1754 */ 1872 */
1755 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1873 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1756 /* 1874 /*
1757 * if we're rebuilding a read, we have to use 1875 * if we're rebuilding a read, we have to use
1758 * pages from the bio list 1876 * pages from the bio list
1759 */ 1877 */
1760 if (rbio->read_rebuild && 1878 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1761 (stripe == faila || stripe == failb)) { 1879 (stripe == faila || stripe == failb)) {
1762 page = page_in_rbio(rbio, stripe, pagenr, 0); 1880 page = page_in_rbio(rbio, stripe, pagenr, 0);
1763 } else { 1881 } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1767 } 1885 }
1768 1886
1769 /* all raid6 handling here */ 1887 /* all raid6 handling here */
1770 if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1888 if (rbio->raid_map[rbio->real_stripes - 1] ==
1771 RAID6_Q_STRIPE) { 1889 RAID6_Q_STRIPE) {
1772 1890
1773 /* 1891 /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1817 } 1935 }
1818 1936
1819 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820 raid6_datap_recov(rbio->bbio->num_stripes, 1938 raid6_datap_recov(rbio->real_stripes,
1821 PAGE_SIZE, faila, pointers); 1939 PAGE_SIZE, faila, pointers);
1822 } else { 1940 } else {
1823 raid6_2data_recov(rbio->bbio->num_stripes, 1941 raid6_2data_recov(rbio->real_stripes,
1824 PAGE_SIZE, faila, failb, 1942 PAGE_SIZE, faila, failb,
1825 pointers); 1943 pointers);
1826 } 1944 }
@@ -1850,7 +1968,7 @@ pstripe:
1850 * know they can be trusted. If this was a read reconstruction, 1968 * know they can be trusted. If this was a read reconstruction,
1851 * other endio functions will fiddle the uptodate bits 1969 * other endio functions will fiddle the uptodate bits
1852 */ 1970 */
1853 if (!rbio->read_rebuild) { 1971 if (rbio->operation == BTRFS_RBIO_WRITE) {
1854 for (i = 0; i < nr_pages; i++) { 1972 for (i = 0; i < nr_pages; i++) {
1855 if (faila != -1) { 1973 if (faila != -1) {
1856 page = rbio_stripe_page(rbio, faila, i); 1974 page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
1862 } 1980 }
1863 } 1981 }
1864 } 1982 }
1865 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1983 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1866 /* 1984 /*
1867 * if we're rebuilding a read, we have to use 1985 * if we're rebuilding a read, we have to use
1868 * pages from the bio list 1986 * pages from the bio list
1869 */ 1987 */
1870 if (rbio->read_rebuild && 1988 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1871 (stripe == faila || stripe == failb)) { 1989 (stripe == faila || stripe == failb)) {
1872 page = page_in_rbio(rbio, stripe, pagenr, 0); 1990 page = page_in_rbio(rbio, stripe, pagenr, 0);
1873 } else { 1991 } else {
@@ -1882,9 +2000,9 @@ cleanup:
1882 kfree(pointers); 2000 kfree(pointers);
1883 2001
1884cleanup_io: 2002cleanup_io:
1885 2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1886 if (rbio->read_rebuild) { 2004 if (err == 0 &&
1887 if (err == 0) 2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1888 cache_rbio_pages(rbio); 2006 cache_rbio_pages(rbio);
1889 else 2007 else
1890 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
1893 } else if (err == 0) { 2011 } else if (err == 0) {
1894 rbio->faila = -1; 2012 rbio->faila = -1;
1895 rbio->failb = -1; 2013 rbio->failb = -1;
1896 finish_rmw(rbio); 2014
2015 if (rbio->operation == BTRFS_RBIO_WRITE)
2016 finish_rmw(rbio);
2017 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2018 finish_parity_scrub(rbio, 0);
2019 else
2020 BUG();
1897 } else { 2021 } else {
1898 rbio_orig_end_io(rbio, err, 0); 2022 rbio_orig_end_io(rbio, err, 0);
1899 } 2023 }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
1917 set_bio_pages_uptodate(bio); 2041 set_bio_pages_uptodate(bio);
1918 bio_put(bio); 2042 bio_put(bio);
1919 2043
1920 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 2044 if (!atomic_dec_and_test(&rbio->stripes_pending))
1921 return; 2045 return;
1922 2046
1923 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 2047 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1924 rbio_orig_end_io(rbio, -EIO, 0); 2048 rbio_orig_end_io(rbio, -EIO, 0);
1925 else 2049 else
1926 __raid_recover_end_io(rbio); 2050 __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
1937static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2061static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938{ 2062{
1939 int bios_to_read = 0; 2063 int bios_to_read = 0;
1940 struct btrfs_bio *bbio = rbio->bbio;
1941 struct bio_list bio_list; 2064 struct bio_list bio_list;
1942 int ret; 2065 int ret;
1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 2066 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1951 if (ret) 2074 if (ret)
1952 goto cleanup; 2075 goto cleanup;
1953 2076
1954 atomic_set(&rbio->bbio->error, 0); 2077 atomic_set(&rbio->error, 0);
1955 2078
1956 /* 2079 /*
1957 * read everything that hasn't failed. Thanks to the 2080 * read everything that hasn't failed. Thanks to the
1958 * stripe cache, it is possible that some or all of these 2081 * stripe cache, it is possible that some or all of these
1959 * pages are going to be uptodate. 2082 * pages are going to be uptodate.
1960 */ 2083 */
1961 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 2084 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1962 if (rbio->faila == stripe || rbio->failb == stripe) { 2085 if (rbio->faila == stripe || rbio->failb == stripe) {
1963 atomic_inc(&rbio->bbio->error); 2086 atomic_inc(&rbio->error);
1964 continue; 2087 continue;
1965 } 2088 }
1966 2089
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1990 * were up to date, or we might have no bios to read because 2113 * were up to date, or we might have no bios to read because
1991 * the devices were gone. 2114 * the devices were gone.
1992 */ 2115 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 2116 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio); 2117 __raid_recover_end_io(rbio);
1995 goto out; 2118 goto out;
1996 } else { 2119 } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2002 * the bbio may be freed once we submit the last bio. Make sure 2125 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that 2126 * not to touch it after that
2004 */ 2127 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read); 2128 atomic_set(&rbio->stripes_pending, bios_to_read);
2006 while (1) { 2129 while (1) {
2007 bio = bio_list_pop(&bio_list); 2130 bio = bio_list_pop(&bio_list);
2008 if (!bio) 2131 if (!bio)
@@ -2021,7 +2144,7 @@ out:
2021 return 0; 2144 return 0;
2022 2145
2023cleanup: 2146cleanup:
2024 if (rbio->read_rebuild) 2147 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2025 rbio_orig_end_io(rbio, -EIO, 0); 2148 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO; 2149 return -EIO;
2027} 2150}
@@ -2034,34 +2157,42 @@ cleanup:
2034 */ 2157 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map, 2159 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num) 2160 u64 stripe_len, int mirror_num, int generic_io)
2038{ 2161{
2039 struct btrfs_raid_bio *rbio; 2162 struct btrfs_raid_bio *rbio;
2040 int ret; 2163 int ret;
2041 2164
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) 2166 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2044 return PTR_ERR(rbio); 2168 return PTR_ERR(rbio);
2169 }
2045 2170
2046 rbio->read_rebuild = 1; 2171 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2047 bio_list_add(&rbio->bio_list, bio); 2172 bio_list_add(&rbio->bio_list, bio);
2048 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2173 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2049 2174
2050 rbio->faila = find_logical_bio_stripe(rbio, bio); 2175 rbio->faila = find_logical_bio_stripe(rbio, bio);
2051 if (rbio->faila == -1) { 2176 if (rbio->faila == -1) {
2052 BUG(); 2177 BUG();
2053 kfree(raid_map); 2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2054 kfree(bbio);
2055 kfree(rbio); 2179 kfree(rbio);
2056 return -EIO; 2180 return -EIO;
2057 } 2181 }
2058 2182
2183 if (generic_io) {
2184 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1;
2186 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2188 }
2189
2059 /* 2190 /*
2060 * reconstruct from the q stripe if they are 2191 * reconstruct from the q stripe if they are
2061 * asking for mirror 3 2192 * asking for mirror 3
2062 */ 2193 */
2063 if (mirror_num == 3) 2194 if (mirror_num == 3)
2064 rbio->failb = bbio->num_stripes - 2; 2195 rbio->failb = rbio->real_stripes - 2;
2065 2196
2066 ret = lock_stripe_add(rbio); 2197 ret = lock_stripe_add(rbio);
2067 2198
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
2098 rbio = container_of(work, struct btrfs_raid_bio, work); 2229 rbio = container_of(work, struct btrfs_raid_bio, work);
2099 __raid56_parity_recover(rbio); 2230 __raid56_parity_recover(rbio);
2100} 2231}
2232
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and will not change during the scrub/replace; that
 * is, those pages hold only metadata or file data protected by a checksum.
 */

/*
 * Allocate a raid bio for scrubbing the parity stripe of @scrub_dev.
 *
 * @bio:             an empty carrier bio; it only holds the completion
 *                   handler so the scrub rbio looks like the other types
 * @dbitmap:         marks which sectors of the stripe carry data and
 *                   therefore need their parity checked
 * @stripe_nsectors: number of sectors per stripe (must match the rbio)
 *
 * Returns the new rbio, or NULL if allocation failed.
 */
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
			       struct btrfs_bio *bbio, u64 *raid_map,
			       u64 stripe_len, struct btrfs_device *scrub_dev,
			       unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio))
		/*
		 * NOTE(review): bbio/raid_map are not freed on this path --
		 * presumably the caller still owns them here; confirm at the
		 * call sites.
		 */
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/* remember which stripe index belongs to the device being scrubbed */
	for (i = 0; i < rbio->real_stripes; i++) {
		if (bbio->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}

	/* Now we just support the sectorsize equals to page size */
	ASSERT(root->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	return rbio;
}
2275
2276void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2277 struct page *page, u64 logical)
2278{
2279 int stripe_offset;
2280 int index;
2281
2282 ASSERT(logical >= rbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page;
2288}
2289
2290/*
2291 * We just scrub the parity that we have correct data on the same horizontal,
2292 * so we needn't allocate all pages for all the stripes.
2293 */
2294static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295{
2296 int i;
2297 int bit;
2298 int index;
2299 struct page *page;
2300
2301 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2302 for (i = 0; i < rbio->real_stripes; i++) {
2303 index = i * rbio->stripe_npages + bit;
2304 if (rbio->stripe_pages[index])
2305 continue;
2306
2307 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2308 if (!page)
2309 return -ENOMEM;
2310 rbio->stripe_pages[index] = page;
2311 ClearPageUptodate(page);
2312 }
2313 }
2314 return 0;
2315}
2316
/*
 * end_io handler for the parity write-back issued by finish_parity_scrub.
 * When the last outstanding write bio completes, the whole rbio is done.
 */
static void raid_write_parity_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/* record which stripe failed so max_errors accounting works */
	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	/* wait for the last outstanding write before completing the rbio */
	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	/* any recorded stripe failure turns the whole operation into -EIO */
	if (atomic_read(&rbio->error))
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
}
2340
/*
 * Verify (and, where wrong, rewrite) the parity of every stripe sector
 * marked in rbio->dbitmap.
 *
 * @need_check: 0 means the parity pages were just reconstructed by the
 *              recovery path and can be written back without re-comparing;
 *              1 means recompute P (and Q for RAID6) from the data pages
 *              and compare against what is on disk.
 *
 * Sectors whose on-disk parity already matches are cleared from dbitmap
 * so they are not written back.  For dev-replace, the repaired parity is
 * additionally written to the replacement target device.
 */
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[rbio->real_stripes];
	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	/* one trailing parity stripe is RAID5, two is RAID6 */
	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	/*
	 * dev-replace: keep a copy of dbitmap, because the main loop below
	 * clears bits for parity that turned out to be correct, while the
	 * target device must receive every sector we checked.
	 */
	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * Because the higher layers(scrubber) are unlikely to
	 * use this area of the disk again soon, so don't cache
	 * it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	/* scratch pages to hold the recomputed P (and Q) for comparison */
	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (q_stripe != -1) {
		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
	}

	atomic_set(&rbio->error, 0);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		pointers[stripe++] = kmap(p_page);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			pointers[stripe++] = kmap(q_page);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5: P is simply the xor of the data pages */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}

		/* Check the scrubbed parity and repair it if it differs */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
		else
			/* Parity is right, needn't writeback */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap(p);

		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	__free_page(p_page);
	if (q_page)
		__free_page(q_page);

writeback:
	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/* dev-replace: also write the checked parity to the target device */
	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bbio->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, 0, 0);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_parity_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}
2502
2503static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2504{
2505 if (stripe >= 0 && stripe < rbio->nr_data)
2506 return 1;
2507 return 0;
2508}
2509
/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	/* more read failures than the redundancy can tolerate: give up */
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		/* count failed data stripes; remember a failed parity stripe */
		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we can not use the parity being scrubbed to repair
		 * the data, the repair capability is reduced by one.
		 * (In the case of RAID5 we can not repair any data stripe.)
		 */
		if (dfail > rbio->bbio->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data is good and only a parity is bad, just
		 * rewrite the parity -- no reconstruction is needed.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we got one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, luckily we can use the other parity to repair
		 * the data; otherwise we can not repair the data stripe.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		/* all reads succeeded: verify and repair the parity */
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}
2570
/*
 * end_io for the read phase of the parity scrub.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate
 * the parity of the stripe.
 *
 * This will usually kick off validate_rbio_for_parity_scrub() once all
 * the bios are read in, and that may trigger parity reconstruction if
 * we had any errors along the way.
 */
static void raid56_parity_scrub_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/* record the failed stripe, or mark the pages as valid */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	/* wait until the last outstanding read has completed */
	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally start the parity check and write-back, but
	 * if there are any failed stripes we'll reconstruct from parity
	 * first
	 */
	validate_rbio_for_parity_scrub(rbio);
}
2600
/*
 * Read phase of the parity scrub: build and submit read bios for every
 * page of the full stripe that is marked in dbitmap and is not already
 * present/uptodate, then let the end_io handler kick off the parity
 * check (or reconstruction) once everything is in.
 */
static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	/* only allocate pages for the sectors we actually need to check */
	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	bio_list_init(&bio_list);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;

		/* punt the end_io to a helper thread, not irq context */
		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}
2689
2690static void scrub_parity_work(struct btrfs_work *work)
2691{
2692 struct btrfs_raid_bio *rbio;
2693
2694 rbio = container_of(work, struct btrfs_raid_bio, work);
2695 raid56_parity_scrub_stripe(rbio);
2696}
2697
2698static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2699{
2700 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2701 scrub_parity_work, NULL, NULL);
2702
2703 btrfs_queue_work(rbio->fs_info->rmw_workers,
2704 &rbio->work);
2705}
2706
/*
 * Submit a scrub rbio.  If somebody else already holds the stripe lock,
 * lock_stripe_add() queues us behind them and they will run us later;
 * otherwise kick off the scrub asynchronously ourselves.
 */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int queued_behind_holder = lock_stripe_add(rbio);

	if (!queued_behind_holder)
		async_scrub_parity(rbio);
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..31d4a157b5e3 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE)) 40 ((x) == RAID6_Q_STRIPE))
41 41
42struct btrfs_raid_bio;
43struct btrfs_device;
44
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num); 47 u64 stripe_len, int mirror_num, int generic_io);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len); 50 u64 stripe_len);
48 51
52struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map,
55 u64 stripe_len, struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical);
59void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
60
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 61int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 62void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif 63#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
63 */ 63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65 65
/*
 * Refcounted block-mapping info shared by all the pages of one recheck
 * block; the last scrub_put_recover() frees it together with the bbio
 * and raid_map it owns.
 */
struct scrub_recover {
	atomic_t		refs;	/* dropped via scrub_put_recover() */
	struct btrfs_bio	*bbio;	/* mapping from btrfs_map_sblock() */
	u64			*raid_map;	/* non-NULL only for RAID5/6 */
	u64			map_length;	/* length covered by bbio */
};
72
66struct scrub_page { 73struct scrub_page {
67 struct scrub_block *sblock; 74 struct scrub_block *sblock;
68 struct page *page; 75 struct page *page;
69 struct btrfs_device *dev; 76 struct btrfs_device *dev;
77 struct list_head list;
70 u64 flags; /* extent flags */ 78 u64 flags; /* extent flags */
71 u64 generation; 79 u64 generation;
72 u64 logical; 80 u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
79 unsigned int io_error:1; 87 unsigned int io_error:1;
80 }; 88 };
81 u8 csum[BTRFS_CSUM_SIZE]; 89 u8 csum[BTRFS_CSUM_SIZE];
90
91 struct scrub_recover *recover;
82}; 92};
83 93
84struct scrub_bio { 94struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
105 atomic_t outstanding_pages; 115 atomic_t outstanding_pages;
106 atomic_t ref_count; /* free mem on transition to zero */ 116 atomic_t ref_count; /* free mem on transition to zero */
107 struct scrub_ctx *sctx; 117 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity;
108 struct { 119 struct {
109 unsigned int header_error:1; 120 unsigned int header_error:1;
110 unsigned int checksum_error:1; 121 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1; 122 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */ 123 unsigned int generation_error:1; /* also sets header_error */
124
125 /* The following is for the data used to check parity */
126 /* It is for the data with checksum */
127 unsigned int data_corrected:1;
113 }; 128 };
114}; 129};
115 130
/* Used for the chunks with parity stripes such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	/* device whose parity stripe is being scrubbed */
	struct btrfs_device	*scrub_dev;

	/* logical range of the full stripe being checked */
	u64			logic_start;

	u64			logic_end;

	/* number of sectors in one stripe */
	int			nsectors;

	int			stripe_len;

	/* free the structure when this drops to zero */
	atomic_t		ref_count;

	/* scrub_pages belonging to this parity run */
	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but errors happen when
	 * read data or check data
	 */
	unsigned long		*ebitmap;

	/*
	 * trailing storage -- presumably dbitmap and ebitmap point into
	 * this flexible area; TODO confirm at the allocation site
	 */
	unsigned long		bitmap[0];
};
163
116struct scrub_wr_ctx { 164struct scrub_wr_ctx {
117 struct scrub_bio *wr_curr_bio; 165 struct scrub_bio *wr_curr_bio;
118 struct btrfs_device *tgtdev; 166 struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
196static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 244static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
197 struct scrub_block *sblock, int is_metadata, 245 struct scrub_block *sblock, int is_metadata,
198 int have_csum, u8 *csum, u64 generation, 246 int have_csum, u8 *csum, u64 generation,
199 u16 csum_size); 247 u16 csum_size, int retry_failed_mirror);
200static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 248static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
201 struct scrub_block *sblock, 249 struct scrub_block *sblock,
202 int is_metadata, int have_csum, 250 int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
218static void scrub_block_put(struct scrub_block *sblock); 266static void scrub_block_put(struct scrub_block *sblock);
219static void scrub_page_get(struct scrub_page *spage); 267static void scrub_page_get(struct scrub_page *spage);
220static void scrub_page_put(struct scrub_page *spage); 268static void scrub_page_put(struct scrub_page *spage);
269static void scrub_parity_get(struct scrub_parity *sparity);
270static void scrub_parity_put(struct scrub_parity *sparity);
221static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 271static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
222 struct scrub_page *spage); 272 struct scrub_page *spage);
223static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 273static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
790 scrub_pending_trans_workers_dec(sctx); 840 scrub_pending_trans_workers_dec(sctx);
791} 841}
792 842
/* Take an extra reference on the shared recover info. */
static inline void scrub_get_recover(struct scrub_recover *recover)
{
	atomic_inc(&recover->refs);
}
847
/*
 * Drop a reference on the recover info; the last put frees it together
 * with the bbio and raid_map it owns.
 */
static inline void scrub_put_recover(struct scrub_recover *recover)
{
	if (atomic_dec_and_test(&recover->refs)) {
		kfree(recover->bbio);
		kfree(recover->raid_map);
		kfree(recover);
	}
}
856
793/* 857/*
794 * scrub_handle_errored_block gets called when either verification of the 858 * scrub_handle_errored_block gets called when either verification of the
795 * pages failed or the bio failed to read, e.g. with EIO. In the latter 859 * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
906 970
907 /* build and submit the bios for the failed mirror, check checksums */ 971 /* build and submit the bios for the failed mirror, check checksums */
908 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 972 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
909 csum, generation, sctx->csum_size); 973 csum, generation, sctx->csum_size, 1);
910 974
911 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 975 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
912 sblock_bad->no_io_error_seen) { 976 sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
920 */ 984 */
921 spin_lock(&sctx->stat_lock); 985 spin_lock(&sctx->stat_lock);
922 sctx->stat.unverified_errors++; 986 sctx->stat.unverified_errors++;
987 sblock_to_check->data_corrected = 1;
923 spin_unlock(&sctx->stat_lock); 988 spin_unlock(&sctx->stat_lock);
924 989
925 if (sctx->is_dev_replace) 990 if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
1019 /* build and submit the bios, check checksums */ 1084 /* build and submit the bios, check checksums */
1020 scrub_recheck_block(fs_info, sblock_other, is_metadata, 1085 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1021 have_csum, csum, generation, 1086 have_csum, csum, generation,
1022 sctx->csum_size); 1087 sctx->csum_size, 0);
1023 1088
1024 if (!sblock_other->header_error && 1089 if (!sblock_other->header_error &&
1025 !sblock_other->checksum_error && 1090 !sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
1169 */ 1234 */
1170 scrub_recheck_block(fs_info, sblock_bad, 1235 scrub_recheck_block(fs_info, sblock_bad,
1171 is_metadata, have_csum, csum, 1236 is_metadata, have_csum, csum,
1172 generation, sctx->csum_size); 1237 generation, sctx->csum_size, 1);
1173 if (!sblock_bad->header_error && 1238 if (!sblock_bad->header_error &&
1174 !sblock_bad->checksum_error && 1239 !sblock_bad->checksum_error &&
1175 sblock_bad->no_io_error_seen) 1240 sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
1180corrected_error: 1245corrected_error:
1181 spin_lock(&sctx->stat_lock); 1246 spin_lock(&sctx->stat_lock);
1182 sctx->stat.corrected_errors++; 1247 sctx->stat.corrected_errors++;
1248 sblock_to_check->data_corrected = 1;
1183 spin_unlock(&sctx->stat_lock); 1249 spin_unlock(&sctx->stat_lock);
1184 printk_ratelimited_in_rcu(KERN_ERR 1250 printk_ratelimited_in_rcu(KERN_ERR
1185 "BTRFS: fixed up error at logical %llu on dev %s\n", 1251 "BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
1201 mirror_index++) { 1267 mirror_index++) {
1202 struct scrub_block *sblock = sblocks_for_recheck + 1268 struct scrub_block *sblock = sblocks_for_recheck +
1203 mirror_index; 1269 mirror_index;
1270 struct scrub_recover *recover;
1204 int page_index; 1271 int page_index;
1205 1272
1206 for (page_index = 0; page_index < sblock->page_count; 1273 for (page_index = 0; page_index < sblock->page_count;
1207 page_index++) { 1274 page_index++) {
1208 sblock->pagev[page_index]->sblock = NULL; 1275 sblock->pagev[page_index]->sblock = NULL;
1276 recover = sblock->pagev[page_index]->recover;
1277 if (recover) {
1278 scrub_put_recover(recover);
1279 sblock->pagev[page_index]->recover =
1280 NULL;
1281 }
1209 scrub_page_put(sblock->pagev[page_index]); 1282 scrub_page_put(sblock->pagev[page_index]);
1210 } 1283 }
1211 } 1284 }
@@ -1215,14 +1288,63 @@ out:
1215 return 0; 1288 return 0;
1216} 1289}
1217 1290
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1292{
1293 if (raid_map) {
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1295 return 3;
1296 else
1297 return 2;
1298 } else {
1299 return (int)bbio->num_stripes;
1300 }
1301}
1302
/*
 * Translate a logical address into the stripe index and the byte offset
 * inside that stripe.
 *
 * For RAID5/6 (@raid_map != NULL) the matching data stripe is found by
 * scanning raid_map; P/Q entries are skipped.  For all other profiles
 * the mirror number directly selects the stripe and the offset is 0.
 */
static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

	if (raid_map) {
		/* RAID5/6 */
		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == RAID6_Q_STRIPE ||
			    raid_map[i] == RAID5_P_STRIPE)
				continue;

			if (logical >= raid_map[i] &&
			    logical < raid_map[i] + mapped_length)
				break;
		}

		/*
		 * NOTE(review): assumes @logical always falls inside one of
		 * the data stripes; if nothing matched, i == nstripes and
		 * raid_map[i] below reads one past the array -- confirm the
		 * callers guarantee a match.
		 */
		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}
1331
1218static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1219 struct btrfs_fs_info *fs_info, 1333 struct btrfs_fs_info *fs_info,
1220 struct scrub_block *original_sblock, 1334 struct scrub_block *original_sblock,
1221 u64 length, u64 logical, 1335 u64 length, u64 logical,
1222 struct scrub_block *sblocks_for_recheck) 1336 struct scrub_block *sblocks_for_recheck)
1223{ 1337{
1338 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen;
1342 u64 mapped_length;
1343 u64 stripe_offset;
1344 int stripe_index;
1224 int page_index; 1345 int page_index;
1225 int mirror_index; 1346 int mirror_index;
1347 int nmirrors;
1226 int ret; 1348 int ret;
1227 1349
1228 /* 1350 /*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1233 1355
1234 page_index = 0; 1356 page_index = 0;
1235 while (length > 0) { 1357 while (length > 0) {
1236 u64 sublen = min_t(u64, length, PAGE_SIZE); 1358 sublen = min_t(u64, length, PAGE_SIZE);
1237 u64 mapped_length = sublen; 1359 mapped_length = sublen;
1238 struct btrfs_bio *bbio = NULL; 1360 bbio = NULL;
1361 raid_map = NULL;
1239 1362
1240 /* 1363 /*
1241 * with a length of PAGE_SIZE, each returned stripe 1364 * with a length of PAGE_SIZE, each returned stripe
1242 * represents one mirror 1365 * represents one mirror
1243 */ 1366 */
1244 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1245 &mapped_length, &bbio, 0); 1368 &mapped_length, &bbio, 0, &raid_map);
1246 if (ret || !bbio || mapped_length < sublen) { 1369 if (ret || !bbio || mapped_length < sublen) {
1247 kfree(bbio); 1370 kfree(bbio);
1371 kfree(raid_map);
1248 return -EIO; 1372 return -EIO;
1249 } 1373 }
1250 1374
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) {
1377 kfree(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM;
1380 }
1381
1382 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length;
1386
1251 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1252 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1388
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1390 for (mirror_index = 0; mirror_index < nmirrors;
1253 mirror_index++) { 1391 mirror_index++) {
1254 struct scrub_block *sblock; 1392 struct scrub_block *sblock;
1255 struct scrub_page *page; 1393 struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
1265 spin_lock(&sctx->stat_lock); 1403 spin_lock(&sctx->stat_lock);
1266 sctx->stat.malloc_errors++; 1404 sctx->stat.malloc_errors++;
1267 spin_unlock(&sctx->stat_lock); 1405 spin_unlock(&sctx->stat_lock);
1268 kfree(bbio); 1406 scrub_put_recover(recover);
1269 return -ENOMEM; 1407 return -ENOMEM;
1270 } 1408 }
1271 scrub_page_get(page); 1409 scrub_page_get(page);
1272 sblock->pagev[page_index] = page; 1410 sblock->pagev[page_index] = page;
1273 page->logical = logical; 1411 page->logical = logical;
1274 page->physical = bbio->stripes[mirror_index].physical; 1412
1413 scrub_stripe_index_and_offset(logical, raid_map,
1414 mapped_length,
1415 bbio->num_stripes,
1416 mirror_index,
1417 &stripe_index,
1418 &stripe_offset);
1419 page->physical = bbio->stripes[stripe_index].physical +
1420 stripe_offset;
1421 page->dev = bbio->stripes[stripe_index].dev;
1422
1275 BUG_ON(page_index >= original_sblock->page_count); 1423 BUG_ON(page_index >= original_sblock->page_count);
1276 page->physical_for_dev_replace = 1424 page->physical_for_dev_replace =
1277 original_sblock->pagev[page_index]-> 1425 original_sblock->pagev[page_index]->
1278 physical_for_dev_replace; 1426 physical_for_dev_replace;
1279 /* for missing devices, dev->bdev is NULL */ 1427 /* for missing devices, dev->bdev is NULL */
1280 page->dev = bbio->stripes[mirror_index].dev;
1281 page->mirror_num = mirror_index + 1; 1428 page->mirror_num = mirror_index + 1;
1282 sblock->page_count++; 1429 sblock->page_count++;
1283 page->page = alloc_page(GFP_NOFS); 1430 page->page = alloc_page(GFP_NOFS);
1284 if (!page->page) 1431 if (!page->page)
1285 goto leave_nomem; 1432 goto leave_nomem;
1433
1434 scrub_get_recover(recover);
1435 page->recover = recover;
1286 } 1436 }
1287 kfree(bbio); 1437 scrub_put_recover(recover);
1288 length -= sublen; 1438 length -= sublen;
1289 logical += sublen; 1439 logical += sublen;
1290 page_index++; 1440 page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
1293 return 0; 1443 return 0;
1294} 1444}
1295 1445
/* Completion cookie for synchronous raid56 reads (scrub_submit_raid56_bio_wait). */
struct scrub_bio_ret {
	struct completion event;	/* signalled by the bio end_io */
	int error;			/* bio result, 0 on success */
};
1450
/* bio end_io: record the result, then wake up the waiting scrubber. */
static void scrub_bio_wait_endio(struct bio *bio, int error)
{
	struct scrub_bio_ret *ret = bio->bi_private;

	/* the error must be stored before complete() releases the waiter */
	ret->error = error;
	complete(&ret->event);
}
1458
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{
1461 return page->recover && page->recover->raid_map;
1462}
1463
/*
 * Read one page synchronously through the RAID5/6 recovery path, so a
 * bad copy is rebuilt from the remaining stripes instead of being read
 * directly.  Returns 0 on success, a negative errno if submission
 * failed, or -EIO if the rebuild/read itself failed.
 */
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_page *page)
{
	struct scrub_bio_ret done;
	int ret;

	init_completion(&done.event);
	done.error = 0;
	/* the raid56 code works on logical addresses, not physical */
	bio->bi_iter.bi_sector = page->logical >> 9;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;

	/*
	 * generic_io == 0: the scrub_recover keeps bbio/raid_map alive,
	 * so the raid56 code must not free them on completion.
	 */
	ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
				    page->recover->raid_map,
				    page->recover->map_length,
				    page->mirror_num, 0);
	if (ret)
		return ret;

	/* block until the asynchronous rebuild + read has finished */
	wait_for_completion(&done.event);
	if (done.error)
		return -EIO;

	return 0;
}
1490
1296/* 1491/*
1297 * this function will check the on disk data for checksum errors, header 1492 * this function will check the on disk data for checksum errors, header
1298 * errors and read I/O errors. If any I/O errors happen, the exact pages 1493 * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
1303static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1498static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1304 struct scrub_block *sblock, int is_metadata, 1499 struct scrub_block *sblock, int is_metadata,
1305 int have_csum, u8 *csum, u64 generation, 1500 int have_csum, u8 *csum, u64 generation,
1306 u16 csum_size) 1501 u16 csum_size, int retry_failed_mirror)
1307{ 1502{
1308 int page_num; 1503 int page_num;
1309 1504
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1329 continue; 1524 continue;
1330 } 1525 }
1331 bio->bi_bdev = page->dev->bdev; 1526 bio->bi_bdev = page->dev->bdev;
1332 bio->bi_iter.bi_sector = page->physical >> 9;
1333 1527
1334 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1528 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1335 if (btrfsic_submit_bio_wait(READ, bio)) 1529 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1336 sblock->no_io_error_seen = 0; 1530 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1531 sblock->no_io_error_seen = 0;
1532 } else {
1533 bio->bi_iter.bi_sector = page->physical >> 9;
1534
1535 if (btrfsic_submit_bio_wait(READ, bio))
1536 sblock->no_io_error_seen = 0;
1537 }
1337 1538
1338 bio_put(bio); 1539 bio_put(bio);
1339 } 1540 }
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1486{ 1687{
1487 int page_num; 1688 int page_num;
1488 1689
1690 /*
1691 * This block is used for the check of the parity on the source device,
1692 * so the data needn't be written into the destination device.
1693 */
1694 if (sblock->sparity)
1695 return;
1696
1489 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1697 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1490 int ret; 1698 int ret;
1491 1699
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
1867 if (atomic_dec_and_test(&sblock->ref_count)) { 2075 if (atomic_dec_and_test(&sblock->ref_count)) {
1868 int i; 2076 int i;
1869 2077
2078 if (sblock->sparity)
2079 scrub_parity_put(sblock->sparity);
2080
1870 for (i = 0; i < sblock->page_count; i++) 2081 for (i = 0; i < sblock->page_count; i++)
1871 scrub_page_put(sblock->pagev[i]); 2082 scrub_page_put(sblock->pagev[i]);
1872 kfree(sblock); 2083 kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
2124 scrub_pending_bio_dec(sctx); 2335 scrub_pending_bio_dec(sctx);
2125} 2336}
2126 2337
2338static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2339 unsigned long *bitmap,
2340 u64 start, u64 len)
2341{
2342 int offset;
2343 int nsectors;
2344 int sectorsize = sparity->sctx->dev_root->sectorsize;
2345
2346 if (len >= sparity->stripe_len) {
2347 bitmap_set(bitmap, 0, sparity->nsectors);
2348 return;
2349 }
2350
2351 start -= sparity->logic_start;
2352 offset = (int)do_div(start, sparity->stripe_len);
2353 offset /= sectorsize;
2354 nsectors = (int)len / sectorsize;
2355
2356 if (offset + nsectors <= sparity->nsectors) {
2357 bitmap_set(bitmap, offset, nsectors);
2358 return;
2359 }
2360
2361 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2362 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2363}
2364
2365static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2366 u64 start, u64 len)
2367{
2368 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2369}
2370
2371static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2372 u64 start, u64 len)
2373{
2374 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2375}
2376
2127static void scrub_block_complete(struct scrub_block *sblock) 2377static void scrub_block_complete(struct scrub_block *sblock)
2128{ 2378{
2379 int corrupted = 0;
2380
2129 if (!sblock->no_io_error_seen) { 2381 if (!sblock->no_io_error_seen) {
2382 corrupted = 1;
2130 scrub_handle_errored_block(sblock); 2383 scrub_handle_errored_block(sblock);
2131 } else { 2384 } else {
2132 /* 2385 /*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
2134 * dev replace case, otherwise write here in dev replace 2387 * dev replace case, otherwise write here in dev replace
2135 * case. 2388 * case.
2136 */ 2389 */
2137 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 2390 corrupted = scrub_checksum(sblock);
2391 if (!corrupted && sblock->sctx->is_dev_replace)
2138 scrub_write_block_to_dev_replace(sblock); 2392 scrub_write_block_to_dev_replace(sblock);
2139 } 2393 }
2394
2395 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2396 u64 start = sblock->pagev[0]->logical;
2397 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2398 PAGE_SIZE;
2399
2400 scrub_parity_mark_sectors_error(sblock->sparity,
2401 start, end - start);
2402 }
2140} 2403}
2141 2404
2142static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2405static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
2228 return 0; 2491 return 0;
2229} 2492}
2230 2493
2494static int scrub_pages_for_parity(struct scrub_parity *sparity,
2495 u64 logical, u64 len,
2496 u64 physical, struct btrfs_device *dev,
2497 u64 flags, u64 gen, int mirror_num, u8 *csum)
2498{
2499 struct scrub_ctx *sctx = sparity->sctx;
2500 struct scrub_block *sblock;
2501 int index;
2502
2503 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2504 if (!sblock) {
2505 spin_lock(&sctx->stat_lock);
2506 sctx->stat.malloc_errors++;
2507 spin_unlock(&sctx->stat_lock);
2508 return -ENOMEM;
2509 }
2510
2511 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1);
2514 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity;
2517 scrub_parity_get(sparity);
2518
2519 for (index = 0; len > 0; index++) {
2520 struct scrub_page *spage;
2521 u64 l = min_t(u64, len, PAGE_SIZE);
2522
2523 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2524 if (!spage) {
2525leave_nomem:
2526 spin_lock(&sctx->stat_lock);
2527 sctx->stat.malloc_errors++;
2528 spin_unlock(&sctx->stat_lock);
2529 scrub_block_put(sblock);
2530 return -ENOMEM;
2531 }
2532 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2533 /* For scrub block */
2534 scrub_page_get(spage);
2535 sblock->pagev[index] = spage;
2536 /* For scrub parity */
2537 scrub_page_get(spage);
2538 list_add_tail(&spage->list, &sparity->spages);
2539 spage->sblock = sblock;
2540 spage->dev = dev;
2541 spage->flags = flags;
2542 spage->generation = gen;
2543 spage->logical = logical;
2544 spage->physical = physical;
2545 spage->mirror_num = mirror_num;
2546 if (csum) {
2547 spage->have_csum = 1;
2548 memcpy(spage->csum, csum, sctx->csum_size);
2549 } else {
2550 spage->have_csum = 0;
2551 }
2552 sblock->page_count++;
2553 spage->page = alloc_page(GFP_NOFS);
2554 if (!spage->page)
2555 goto leave_nomem;
2556 len -= l;
2557 logical += l;
2558 physical += l;
2559 }
2560
2561 WARN_ON(sblock->page_count == 0);
2562 for (index = 0; index < sblock->page_count; index++) {
2563 struct scrub_page *spage = sblock->pagev[index];
2564 int ret;
2565
2566 ret = scrub_add_page_to_rd_bio(sctx, spage);
2567 if (ret) {
2568 scrub_block_put(sblock);
2569 return ret;
2570 }
2571 }
2572
2573 /* last one frees, either here or in bio completion for last page */
2574 scrub_block_put(sblock);
2575 return 0;
2576}
2577
2578static int scrub_extent_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u64 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num)
2582{
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 int ret;
2585 u8 csum[BTRFS_CSUM_SIZE];
2586 u32 blocksize;
2587
2588 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2589 blocksize = sctx->sectorsize;
2590 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2591 blocksize = sctx->nodesize;
2592 } else {
2593 blocksize = sctx->sectorsize;
2594 WARN_ON(1);
2595 }
2596
2597 while (len) {
2598 u64 l = min_t(u64, len, blocksize);
2599 int have_csum = 0;
2600
2601 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2602 /* push csums to sbio */
2603 have_csum = scrub_find_csum(sctx, logical, l, csum);
2604 if (have_csum == 0)
2605 goto skip;
2606 }
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num,
2609 have_csum ? csum : NULL);
2610skip:
2611 if (ret)
2612 return ret;
2613 len -= l;
2614 logical += l;
2615 physical += l;
2616 }
2617 return 0;
2618}
2619
2231/* 2620/*
2232 * Given a physical address, this will calculate it's 2621 * Given a physical address, this will calculate it's
2233 * logical offset. if this is a parity stripe, it will return 2622 * logical offset. if this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
2236 * return 0 if it is a data stripe, 1 means parity stripe. 2625 * return 0 if it is a data stripe, 1 means parity stripe.
2237 */ 2626 */
2238static int get_raid56_logic_offset(u64 physical, int num, 2627static int get_raid56_logic_offset(u64 physical, int num,
2239 struct map_lookup *map, u64 *offset) 2628 struct map_lookup *map, u64 *offset,
2629 u64 *stripe_start)
2240{ 2630{
2241 int i; 2631 int i;
2242 int j = 0; 2632 int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
2247 2637
2248 last_offset = (physical - map->stripes[num].physical) * 2638 last_offset = (physical - map->stripes[num].physical) *
2249 nr_data_stripes(map); 2639 nr_data_stripes(map);
2640 if (stripe_start)
2641 *stripe_start = last_offset;
2642
2250 *offset = last_offset; 2643 *offset = last_offset;
2251 for (i = 0; i < nr_data_stripes(map); i++) { 2644 for (i = 0; i < nr_data_stripes(map); i++) {
2252 *offset = last_offset + i * map->stripe_len; 2645 *offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
2269 return 1; 2662 return 1;
2270} 2663}
2271 2664
2665static void scrub_free_parity(struct scrub_parity *sparity)
2666{
2667 struct scrub_ctx *sctx = sparity->sctx;
2668 struct scrub_page *curr, *next;
2669 int nbits;
2670
2671 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2672 if (nbits) {
2673 spin_lock(&sctx->stat_lock);
2674 sctx->stat.read_errors += nbits;
2675 sctx->stat.uncorrectable_errors += nbits;
2676 spin_unlock(&sctx->stat_lock);
2677 }
2678
2679 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2680 list_del_init(&curr->list);
2681 scrub_page_put(curr);
2682 }
2683
2684 kfree(sparity);
2685}
2686
2687static void scrub_parity_bio_endio(struct bio *bio, int error)
2688{
2689 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2690 struct scrub_ctx *sctx = sparity->sctx;
2691
2692 if (error)
2693 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2694 sparity->nsectors);
2695
2696 scrub_free_parity(sparity);
2697 scrub_pending_bio_dec(sctx);
2698 bio_put(bio);
2699}
2700
2701static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2702{
2703 struct scrub_ctx *sctx = sparity->sctx;
2704 struct bio *bio;
2705 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length;
2710 int ret;
2711
2712 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2713 sparity->nsectors))
2714 goto out;
2715
2716 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map);
2720 if (ret || !bbio || !raid_map)
2721 goto bbio_out;
2722
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2724 if (!bio)
2725 goto bbio_out;
2726
2727 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2728 bio->bi_private = sparity;
2729 bio->bi_end_io = scrub_parity_bio_endio;
2730
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length,
2733 sparity->scrub_dev,
2734 sparity->dbitmap,
2735 sparity->nsectors);
2736 if (!rbio)
2737 goto rbio_out;
2738
2739 list_for_each_entry(spage, &sparity->spages, list)
2740 raid56_parity_add_scrub_pages(rbio, spage->page,
2741 spage->logical);
2742
2743 scrub_pending_bio_inc(sctx);
2744 raid56_parity_submit_scrub_rbio(rbio);
2745 return;
2746
2747rbio_out:
2748 bio_put(bio);
2749bbio_out:
2750 kfree(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock);
2755 sctx->stat.malloc_errors++;
2756 spin_unlock(&sctx->stat_lock);
2757out:
2758 scrub_free_parity(sparity);
2759}
2760
2761static inline int scrub_calc_parity_bitmap_len(int nsectors)
2762{
2763 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2764}
2765
2766static void scrub_parity_get(struct scrub_parity *sparity)
2767{
2768 atomic_inc(&sparity->ref_count);
2769}
2770
2771static void scrub_parity_put(struct scrub_parity *sparity)
2772{
2773 if (!atomic_dec_and_test(&sparity->ref_count))
2774 return;
2775
2776 scrub_parity_check_and_repair(sparity);
2777}
2778
2779static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2780 struct map_lookup *map,
2781 struct btrfs_device *sdev,
2782 struct btrfs_path *path,
2783 u64 logic_start,
2784 u64 logic_end)
2785{
2786 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2787 struct btrfs_root *root = fs_info->extent_root;
2788 struct btrfs_root *csum_root = fs_info->csum_root;
2789 struct btrfs_extent_item *extent;
2790 u64 flags;
2791 int ret;
2792 int slot;
2793 struct extent_buffer *l;
2794 struct btrfs_key key;
2795 u64 generation;
2796 u64 extent_logical;
2797 u64 extent_physical;
2798 u64 extent_len;
2799 struct btrfs_device *extent_dev;
2800 struct scrub_parity *sparity;
2801 int nsectors;
2802 int bitmap_len;
2803 int extent_mirror_num;
2804 int stop_loop = 0;
2805
2806 nsectors = map->stripe_len / root->sectorsize;
2807 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2808 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2809 GFP_NOFS);
2810 if (!sparity) {
2811 spin_lock(&sctx->stat_lock);
2812 sctx->stat.malloc_errors++;
2813 spin_unlock(&sctx->stat_lock);
2814 return -ENOMEM;
2815 }
2816
2817 sparity->stripe_len = map->stripe_len;
2818 sparity->nsectors = nsectors;
2819 sparity->sctx = sctx;
2820 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1);
2824 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2827
2828 ret = 0;
2829 while (logic_start < logic_end) {
2830 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2831 key.type = BTRFS_METADATA_ITEM_KEY;
2832 else
2833 key.type = BTRFS_EXTENT_ITEM_KEY;
2834 key.objectid = logic_start;
2835 key.offset = (u64)-1;
2836
2837 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2838 if (ret < 0)
2839 goto out;
2840
2841 if (ret > 0) {
2842 ret = btrfs_previous_extent_item(root, path, 0);
2843 if (ret < 0)
2844 goto out;
2845 if (ret > 0) {
2846 btrfs_release_path(path);
2847 ret = btrfs_search_slot(NULL, root, &key,
2848 path, 0, 0);
2849 if (ret < 0)
2850 goto out;
2851 }
2852 }
2853
2854 stop_loop = 0;
2855 while (1) {
2856 u64 bytes;
2857
2858 l = path->nodes[0];
2859 slot = path->slots[0];
2860 if (slot >= btrfs_header_nritems(l)) {
2861 ret = btrfs_next_leaf(root, path);
2862 if (ret == 0)
2863 continue;
2864 if (ret < 0)
2865 goto out;
2866
2867 stop_loop = 1;
2868 break;
2869 }
2870 btrfs_item_key_to_cpu(l, &key, slot);
2871
2872 if (key.type == BTRFS_METADATA_ITEM_KEY)
2873 bytes = root->nodesize;
2874 else
2875 bytes = key.offset;
2876
2877 if (key.objectid + bytes <= logic_start)
2878 goto next;
2879
2880 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2881 key.type != BTRFS_METADATA_ITEM_KEY)
2882 goto next;
2883
2884 if (key.objectid > logic_end) {
2885 stop_loop = 1;
2886 break;
2887 }
2888
2889 while (key.objectid >= logic_start + map->stripe_len)
2890 logic_start += map->stripe_len;
2891
2892 extent = btrfs_item_ptr(l, slot,
2893 struct btrfs_extent_item);
2894 flags = btrfs_extent_flags(l, extent);
2895 generation = btrfs_extent_generation(l, extent);
2896
2897 if (key.objectid < logic_start &&
2898 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2899 btrfs_err(fs_info,
2900 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2901 key.objectid, logic_start);
2902 goto next;
2903 }
2904again:
2905 extent_logical = key.objectid;
2906 extent_len = bytes;
2907
2908 if (extent_logical < logic_start) {
2909 extent_len -= logic_start - extent_logical;
2910 extent_logical = logic_start;
2911 }
2912
2913 if (extent_logical + extent_len >
2914 logic_start + map->stripe_len)
2915 extent_len = logic_start + map->stripe_len -
2916 extent_logical;
2917
2918 scrub_parity_mark_sectors_data(sparity, extent_logical,
2919 extent_len);
2920
2921 scrub_remap_extent(fs_info, extent_logical,
2922 extent_len, &extent_physical,
2923 &extent_dev,
2924 &extent_mirror_num);
2925
2926 ret = btrfs_lookup_csums_range(csum_root,
2927 extent_logical,
2928 extent_logical + extent_len - 1,
2929 &sctx->csum_list, 1);
2930 if (ret)
2931 goto out;
2932
2933 ret = scrub_extent_for_parity(sparity, extent_logical,
2934 extent_len,
2935 extent_physical,
2936 extent_dev, flags,
2937 generation,
2938 extent_mirror_num);
2939 if (ret)
2940 goto out;
2941
2942 scrub_free_csums(sctx);
2943 if (extent_logical + extent_len <
2944 key.objectid + bytes) {
2945 logic_start += map->stripe_len;
2946
2947 if (logic_start >= logic_end) {
2948 stop_loop = 1;
2949 break;
2950 }
2951
2952 if (logic_start < key.objectid + bytes) {
2953 cond_resched();
2954 goto again;
2955 }
2956 }
2957next:
2958 path->slots[0]++;
2959 }
2960
2961 btrfs_release_path(path);
2962
2963 if (stop_loop)
2964 break;
2965
2966 logic_start += map->stripe_len;
2967 }
2968out:
2969 if (ret < 0)
2970 scrub_parity_mark_sectors_error(sparity, logic_start,
2971 logic_end - logic_start + 1);
2972 scrub_parity_put(sparity);
2973 scrub_submit(sctx);
2974 mutex_lock(&sctx->wr_ctx.wr_lock);
2975 scrub_wr_submit(sctx);
2976 mutex_unlock(&sctx->wr_ctx.wr_lock);
2977
2978 btrfs_release_path(path);
2979 return ret < 0 ? ret : 0;
2980}
2981
2272static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2982static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2273 struct map_lookup *map, 2983 struct map_lookup *map,
2274 struct btrfs_device *scrub_dev, 2984 struct btrfs_device *scrub_dev,
2275 int num, u64 base, u64 length, 2985 int num, u64 base, u64 length,
2276 int is_dev_replace) 2986 int is_dev_replace)
2277{ 2987{
2278 struct btrfs_path *path; 2988 struct btrfs_path *path, *ppath;
2279 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2989 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2280 struct btrfs_root *root = fs_info->extent_root; 2990 struct btrfs_root *root = fs_info->extent_root;
2281 struct btrfs_root *csum_root = fs_info->csum_root; 2991 struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2302 u64 extent_logical; 3012 u64 extent_logical;
2303 u64 extent_physical; 3013 u64 extent_physical;
2304 u64 extent_len; 3014 u64 extent_len;
3015 u64 stripe_logical;
3016 u64 stripe_end;
2305 struct btrfs_device *extent_dev; 3017 struct btrfs_device *extent_dev;
2306 int extent_mirror_num; 3018 int extent_mirror_num;
2307 int stop_loop = 0; 3019 int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2327 mirror_num = num % map->num_stripes + 1; 3039 mirror_num = num % map->num_stripes + 1;
2328 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2329 BTRFS_BLOCK_GROUP_RAID6)) { 3041 BTRFS_BLOCK_GROUP_RAID6)) {
2330 get_raid56_logic_offset(physical, num, map, &offset); 3042 get_raid56_logic_offset(physical, num, map, &offset, NULL);
2331 increment = map->stripe_len * nr_data_stripes(map); 3043 increment = map->stripe_len * nr_data_stripes(map);
2332 mirror_num = 1; 3044 mirror_num = 1;
2333 } else { 3045 } else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2339 if (!path) 3051 if (!path)
2340 return -ENOMEM; 3052 return -ENOMEM;
2341 3053
3054 ppath = btrfs_alloc_path();
3055 if (!ppath) {
3056 btrfs_free_path(ppath);
3057 return -ENOMEM;
3058 }
3059
2342 /* 3060 /*
2343 * work on commit root. The related disk blocks are static as 3061 * work on commit root. The related disk blocks are static as
2344 * long as COW is applied. This means, it is save to rewrite 3062 * long as COW is applied. This means, it is save to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2358 BTRFS_BLOCK_GROUP_RAID6)) { 3076 BTRFS_BLOCK_GROUP_RAID6)) {
2359 get_raid56_logic_offset(physical_end, num, 3077 get_raid56_logic_offset(physical_end, num,
2360 map, &logic_end); 3078 map, &logic_end, NULL);
2361 logic_end += base; 3079 logic_end += base;
2362 } else { 3080 } else {
2363 logic_end = logical + increment * nstripes; 3081 logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2404 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2405 BTRFS_BLOCK_GROUP_RAID6)) { 3123 BTRFS_BLOCK_GROUP_RAID6)) {
2406 ret = get_raid56_logic_offset(physical, num, 3124 ret = get_raid56_logic_offset(physical, num,
2407 map, &logical); 3125 map, &logical, &stripe_logical);
2408 logical += base; 3126 logical += base;
2409 if (ret) 3127 if (ret) {
3128 stripe_logical += base;
3129 stripe_end = stripe_logical + increment - 1;
3130 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3131 ppath, stripe_logical,
3132 stripe_end);
3133 if (ret)
3134 goto out;
2410 goto skip; 3135 goto skip;
3136 }
2411 } 3137 }
2412 /* 3138 /*
2413 * canceled? 3139 * canceled?
@@ -2558,13 +3284,25 @@ again:
2558 * loop until we find next data stripe 3284 * loop until we find next data stripe
2559 * or we have finished all stripes. 3285 * or we have finished all stripes.
2560 */ 3286 */
2561 do { 3287loop:
2562 physical += map->stripe_len; 3288 physical += map->stripe_len;
2563 ret = get_raid56_logic_offset( 3289 ret = get_raid56_logic_offset(physical,
2564 physical, num, 3290 num, map, &logical,
2565 map, &logical); 3291 &stripe_logical);
2566 logical += base; 3292 logical += base;
2567 } while (physical < physical_end && ret); 3293
3294 if (ret && physical < physical_end) {
3295 stripe_logical += base;
3296 stripe_end = stripe_logical +
3297 increment - 1;
3298 ret = scrub_raid56_parity(sctx,
3299 map, scrub_dev, ppath,
3300 stripe_logical,
3301 stripe_end);
3302 if (ret)
3303 goto out;
3304 goto loop;
3305 }
2568 } else { 3306 } else {
2569 physical += map->stripe_len; 3307 physical += map->stripe_len;
2570 logical += increment; 3308 logical += increment;
@@ -2605,6 +3343,7 @@ out:
2605 3343
2606 blk_finish_plug(&plug); 3344 blk_finish_plug(&plug);
2607 btrfs_free_path(path); 3345 btrfs_free_path(path);
3346 btrfs_free_path(ppath);
2608 return ret < 0 ? ret : 0; 3347 return ret < 0 ? ret : 0;
2609} 3348}
2610 3349
@@ -3310,6 +4049,50 @@ out:
3310 scrub_pending_trans_workers_dec(sctx); 4049 scrub_pending_trans_workers_dec(sctx);
3311} 4050}
3312 4051
4052static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4053 u64 logical)
4054{
4055 struct extent_state *cached_state = NULL;
4056 struct btrfs_ordered_extent *ordered;
4057 struct extent_io_tree *io_tree;
4058 struct extent_map *em;
4059 u64 lockstart = start, lockend = start + len - 1;
4060 int ret = 0;
4061
4062 io_tree = &BTRFS_I(inode)->io_tree;
4063
4064 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4065 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4066 if (ordered) {
4067 btrfs_put_ordered_extent(ordered);
4068 ret = 1;
4069 goto out_unlock;
4070 }
4071
4072 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4073 if (IS_ERR(em)) {
4074 ret = PTR_ERR(em);
4075 goto out_unlock;
4076 }
4077
4078 /*
4079 * This extent does not actually cover the logical extent anymore,
4080 * move on to the next inode.
4081 */
4082 if (em->block_start > logical ||
4083 em->block_start + em->block_len < logical + len) {
4084 free_extent_map(em);
4085 ret = 1;
4086 goto out_unlock;
4087 }
4088 free_extent_map(em);
4089
4090out_unlock:
4091 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4092 GFP_NOFS);
4093 return ret;
4094}
4095
3313static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 4096static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3314 struct scrub_copy_nocow_ctx *nocow_ctx) 4097 struct scrub_copy_nocow_ctx *nocow_ctx)
3315{ 4098{
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3318 struct inode *inode; 4101 struct inode *inode;
3319 struct page *page; 4102 struct page *page;
3320 struct btrfs_root *local_root; 4103 struct btrfs_root *local_root;
3321 struct btrfs_ordered_extent *ordered;
3322 struct extent_map *em;
3323 struct extent_state *cached_state = NULL;
3324 struct extent_io_tree *io_tree; 4104 struct extent_io_tree *io_tree;
3325 u64 physical_for_dev_replace; 4105 u64 physical_for_dev_replace;
4106 u64 nocow_ctx_logical;
3326 u64 len = nocow_ctx->len; 4107 u64 len = nocow_ctx->len;
3327 u64 lockstart = offset, lockend = offset + len - 1;
3328 unsigned long index; 4108 unsigned long index;
3329 int srcu_index; 4109 int srcu_index;
3330 int ret = 0; 4110 int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3356 4136
3357 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4137 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3358 io_tree = &BTRFS_I(inode)->io_tree; 4138 io_tree = &BTRFS_I(inode)->io_tree;
4139 nocow_ctx_logical = nocow_ctx->logical;
3359 4140
3360 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); 4141 ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
3361 ordered = btrfs_lookup_ordered_range(inode, lockstart, len); 4142 if (ret) {
3362 if (ordered) { 4143 ret = ret > 0 ? 0 : ret;
3363 btrfs_put_ordered_extent(ordered); 4144 goto out;
3364 goto out_unlock;
3365 }
3366
3367 em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3368 if (IS_ERR(em)) {
3369 ret = PTR_ERR(em);
3370 goto out_unlock;
3371 }
3372
3373 /*
3374 * This extent does not actually cover the logical extent anymore,
3375 * move on to the next inode.
3376 */
3377 if (em->block_start > nocow_ctx->logical ||
3378 em->block_start + em->block_len < nocow_ctx->logical + len) {
3379 free_extent_map(em);
3380 goto out_unlock;
3381 } 4145 }
3382 free_extent_map(em);
3383 4146
3384 while (len >= PAGE_CACHE_SIZE) { 4147 while (len >= PAGE_CACHE_SIZE) {
3385 index = offset >> PAGE_CACHE_SHIFT; 4148 index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
3396 goto next_page; 4159 goto next_page;
3397 } else { 4160 } else {
3398 ClearPageError(page); 4161 ClearPageError(page);
3399 err = extent_read_full_page_nolock(io_tree, page, 4162 err = extent_read_full_page(io_tree, page,
3400 btrfs_get_extent, 4163 btrfs_get_extent,
3401 nocow_ctx->mirror_num); 4164 nocow_ctx->mirror_num);
3402 if (err) { 4165 if (err) {
@@ -3421,6 +4184,14 @@ again:
3421 goto next_page; 4184 goto next_page;
3422 } 4185 }
3423 } 4186 }
4187
4188 ret = check_extent_to_block(inode, offset, len,
4189 nocow_ctx_logical);
4190 if (ret) {
4191 ret = ret > 0 ? 0 : ret;
4192 goto next_page;
4193 }
4194
3424 err = write_page_nocow(nocow_ctx->sctx, 4195 err = write_page_nocow(nocow_ctx->sctx,
3425 physical_for_dev_replace, page); 4196 physical_for_dev_replace, page);
3426 if (err) 4197 if (err)
@@ -3434,12 +4205,10 @@ next_page:
3434 4205
3435 offset += PAGE_CACHE_SIZE; 4206 offset += PAGE_CACHE_SIZE;
3436 physical_for_dev_replace += PAGE_CACHE_SIZE; 4207 physical_for_dev_replace += PAGE_CACHE_SIZE;
4208 nocow_ctx_logical += PAGE_CACHE_SIZE;
3437 len -= PAGE_CACHE_SIZE; 4209 len -= PAGE_CACHE_SIZE;
3438 } 4210 }
3439 ret = COPY_COMPLETE; 4211 ret = COPY_COMPLETE;
3440out_unlock:
3441 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3442 GFP_NOFS);
3443out: 4212out:
3444 mutex_unlock(&inode->i_mutex); 4213 mutex_unlock(&inode->i_mutex);
3445 iput(inode); 4214 iput(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..804432dbc351 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
5507 return ret; 5507 return ret;
5508} 5508}
5509 5509
5510/*
5511 * If orphan cleanup did remove any orphans from a root, it means the tree
5512 * was modified and therefore the commit root is not the same as the current
5513 * root anymore. This is a problem, because send uses the commit root and
5514 * therefore can see inode items that don't exist in the current root anymore,
5515 * and for example make calls to btrfs_iget, which will do tree lookups based
5516 * on the current root and not on the commit root. Those lookups will fail,
5517 * returning a -ESTALE error, and making send fail with that error. So make
5518 * sure a send does not see any orphans we have just removed, and that it will
5519 * see the same inodes regardless of whether a transaction commit happened
5520 * before it started (meaning that the commit root will be the same as the
5521 * current root) or not.
5522 */
5523static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
5524{
5525 int i;
5526 struct btrfs_trans_handle *trans = NULL;
5527
5528again:
5529 if (sctx->parent_root &&
5530 sctx->parent_root->node != sctx->parent_root->commit_root)
5531 goto commit_trans;
5532
5533 for (i = 0; i < sctx->clone_roots_cnt; i++)
5534 if (sctx->clone_roots[i].root->node !=
5535 sctx->clone_roots[i].root->commit_root)
5536 goto commit_trans;
5537
5538 if (trans)
5539 return btrfs_end_transaction(trans, sctx->send_root);
5540
5541 return 0;
5542
5543commit_trans:
5544 /* Use any root, all fs roots will get their commit roots updated. */
5545 if (!trans) {
5546 trans = btrfs_join_transaction(sctx->send_root);
5547 if (IS_ERR(trans))
5548 return PTR_ERR(trans);
5549 goto again;
5550 }
5551
5552 return btrfs_commit_transaction(trans, sctx->send_root);
5553}
5554
5510static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) 5555static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
5511{ 5556{
5512 spin_lock(&root->root_item_lock); 5557 spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5728 NULL); 5773 NULL);
5729 sort_clone_roots = 1; 5774 sort_clone_roots = 1;
5730 5775
5776 ret = ensure_commit_roots_uptodate(sctx);
5777 if (ret)
5778 goto out;
5779
5731 current->journal_info = BTRFS_SEND_TRANS_STUB; 5780 current->journal_info = BTRFS_SEND_TRANS_STUB;
5732 ret = send_subvol(sctx); 5781 ret = send_subvol(sctx);
5733 current->journal_info = NULL; 5782 current->journal_info = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 54bd91ece35b..60f7cbe815e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
262 trans->aborted = errno; 262 trans->aborted = errno;
263 /* Nothing used. The other threads that have joined this 263 /* Nothing used. The other threads that have joined this
264 * transaction may be able to continue. */ 264 * transaction may be able to continue. */
265 if (!trans->blocks_used) { 265 if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
266 const char *errstr; 266 const char *errstr;
267 267
268 errstr = btrfs_decode_error(errno); 268 errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
642 "disabling disk space caching"); 642 "disabling disk space caching");
643 break; 643 break;
644 case Opt_inode_cache: 644 case Opt_inode_cache:
645 btrfs_set_and_info(root, CHANGE_INODE_CACHE, 645 btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
646 "enabling inode map caching"); 646 "enabling inode map caching");
647 break; 647 break;
648 case Opt_noinode_cache: 648 case Opt_noinode_cache:
649 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 649 btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
650 "disabling inode map caching"); 650 "disabling inode map caching");
651 break; 651 break;
652 case Opt_clear_cache: 652 case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
993 trans = btrfs_attach_transaction_barrier(root); 993 trans = btrfs_attach_transaction_barrier(root);
994 if (IS_ERR(trans)) { 994 if (IS_ERR(trans)) {
995 /* no transaction, don't bother */ 995 /* no transaction, don't bother */
996 if (PTR_ERR(trans) == -ENOENT) 996 if (PTR_ERR(trans) == -ENOENT) {
997 return 0; 997 /*
998 return PTR_ERR(trans); 998 * Exit unless we have some pending changes
999 * that need to go through commit
1000 */
1001 if (fs_info->pending_changes == 0)
1002 return 0;
1003 trans = btrfs_start_transaction(root, 0);
1004 } else {
1005 return PTR_ERR(trans);
1006 }
999 } 1007 }
1000 return btrfs_commit_transaction(trans, root); 1008 return btrfs_commit_transaction(trans, root);
1001} 1009}
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1644 int i = 0, nr_devices; 1652 int i = 0, nr_devices;
1645 int ret; 1653 int ret;
1646 1654
1655 /*
1656 * We aren't under the device list lock, so this is racey-ish, but good
1657 * enough for our purposes.
1658 */
1647 nr_devices = fs_info->fs_devices->open_devices; 1659 nr_devices = fs_info->fs_devices->open_devices;
1648 BUG_ON(!nr_devices); 1660 if (!nr_devices) {
1661 smp_mb();
1662 nr_devices = fs_info->fs_devices->open_devices;
1663 ASSERT(nr_devices);
1664 if (!nr_devices) {
1665 *free_bytes = 0;
1666 return 0;
1667 }
1668 }
1649 1669
1650 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), 1670 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
1651 GFP_NOFS); 1671 GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1670 else 1690 else
1671 min_stripe_size = BTRFS_STRIPE_LEN; 1691 min_stripe_size = BTRFS_STRIPE_LEN;
1672 1692
1673 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1693 if (fs_info->alloc_start)
1694 mutex_lock(&fs_devices->device_list_mutex);
1695 rcu_read_lock();
1696 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
1674 if (!device->in_fs_metadata || !device->bdev || 1697 if (!device->in_fs_metadata || !device->bdev ||
1675 device->is_tgtdev_for_dev_replace) 1698 device->is_tgtdev_for_dev_replace)
1676 continue; 1699 continue;
1677 1700
1701 if (i >= nr_devices)
1702 break;
1703
1678 avail_space = device->total_bytes - device->bytes_used; 1704 avail_space = device->total_bytes - device->bytes_used;
1679 1705
1680 /* align with stripe_len */ 1706 /* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1689 skip_space = 1024 * 1024; 1715 skip_space = 1024 * 1024;
1690 1716
1691 /* user can set the offset in fs_info->alloc_start. */ 1717 /* user can set the offset in fs_info->alloc_start. */
1692 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= 1718 if (fs_info->alloc_start &&
1693 device->total_bytes) 1719 fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1720 device->total_bytes) {
1721 rcu_read_unlock();
1694 skip_space = max(fs_info->alloc_start, skip_space); 1722 skip_space = max(fs_info->alloc_start, skip_space);
1695 1723
1696 /* 1724 /*
1697 * btrfs can not use the free space in [0, skip_space - 1], 1725 * btrfs can not use the free space in
1698 * we must subtract it from the total. In order to implement 1726 * [0, skip_space - 1], we must subtract it from the
1699 * it, we account the used space in this range first. 1727 * total. In order to implement it, we account the used
1700 */ 1728 * space in this range first.
1701 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, 1729 */
1702 &used_space); 1730 ret = btrfs_account_dev_extents_size(device, 0,
1703 if (ret) { 1731 skip_space - 1,
1704 kfree(devices_info); 1732 &used_space);
1705 return ret; 1733 if (ret) {
1706 } 1734 kfree(devices_info);
1735 mutex_unlock(&fs_devices->device_list_mutex);
1736 return ret;
1737 }
1707 1738
1708 /* calc the free space in [0, skip_space - 1] */ 1739 rcu_read_lock();
1709 skip_space -= used_space; 1740
1741 /* calc the free space in [0, skip_space - 1] */
1742 skip_space -= used_space;
1743 }
1710 1744
1711 /* 1745 /*
1712 * we can use the free space in [0, skip_space - 1], subtract 1746 * we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1725 1759
1726 i++; 1760 i++;
1727 } 1761 }
1762 rcu_read_unlock();
1763 if (fs_info->alloc_start)
1764 mutex_unlock(&fs_devices->device_list_mutex);
1728 1765
1729 nr_devices = i; 1766 nr_devices = i;
1730 1767
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1787 * holding chunk_muext to avoid allocating new chunks, holding 1824 * holding chunk_muext to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed 1825 * device_list_mutex to avoid the device being removed
1789 */ 1826 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1791 mutex_lock(&fs_info->chunk_mutex);
1792 rcu_read_lock(); 1827 rcu_read_lock();
1793 list_for_each_entry_rcu(found, head, list) { 1828 list_for_each_entry_rcu(found, head, list) {
1794 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1829 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1824 buf->f_bfree -= block_rsv->size >> bits; 1859 buf->f_bfree -= block_rsv->size >> bits;
1825 spin_unlock(&block_rsv->lock); 1860 spin_unlock(&block_rsv->lock);
1826 1861
1827 buf->f_bavail = total_free_data; 1862 buf->f_bavail = div_u64(total_free_data, factor);
1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1863 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1829 if (ret) { 1864 if (ret)
1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1832 return ret; 1865 return ret;
1833 }
1834 buf->f_bavail += div_u64(total_free_data, factor); 1866 buf->f_bavail += div_u64(total_free_data, factor);
1835 buf->f_bavail = buf->f_bavail >> bits; 1867 buf->f_bavail = buf->f_bavail >> bits;
1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1838 1868
1839 buf->f_type = BTRFS_SUPER_MAGIC; 1869 buf->f_type = BTRFS_SUPER_MAGIC;
1840 buf->f_bsize = dentry->d_sb->s_blocksize; 1870 buf->f_bsize = dentry->d_sb->s_blocksize;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b2e7bb4393f6..92db3f648df4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
111{ 111{
112 struct btrfs_fs_info *fs_info; 112 struct btrfs_fs_info *fs_info;
113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); 113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
114 struct btrfs_trans_handle *trans;
115 u64 features, set, clear; 114 u64 features, set, clear;
116 unsigned long val; 115 unsigned long val;
117 int ret; 116 int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
153 btrfs_info(fs_info, "%s %s feature flag", 152 btrfs_info(fs_info, "%s %s feature flag",
154 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); 153 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
155 154
156 trans = btrfs_start_transaction(fs_info->fs_root, 0);
157 if (IS_ERR(trans))
158 return PTR_ERR(trans);
159
160 spin_lock(&fs_info->super_lock); 155 spin_lock(&fs_info->super_lock);
161 features = get_features(fs_info, fa->feature_set); 156 features = get_features(fs_info, fa->feature_set);
162 if (val) 157 if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
166 set_features(fs_info, fa->feature_set, features); 161 set_features(fs_info, fa->feature_set, features);
167 spin_unlock(&fs_info->super_lock); 162 spin_unlock(&fs_info->super_lock);
168 163
169 ret = btrfs_commit_transaction(trans, fs_info->fs_root); 164 /*
170 if (ret) 165 * We don't want to do full transaction commit from inside sysfs
171 return ret; 166 */
167 btrfs_set_pending(fs_info, COMMIT);
168 wake_up_process(fs_info->transaction_kthread);
172 169
173 return count; 170 return count;
174} 171}
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
372 const char *buf, size_t len) 369 const char *buf, size_t len)
373{ 370{
374 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 371 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
375 struct btrfs_trans_handle *trans;
376 struct btrfs_root *root = fs_info->fs_root;
377 int ret;
378 size_t p_len; 372 size_t p_len;
379 373
380 if (fs_info->sb->s_flags & MS_RDONLY) 374 if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
389 if (p_len >= BTRFS_LABEL_SIZE) 383 if (p_len >= BTRFS_LABEL_SIZE)
390 return -EINVAL; 384 return -EINVAL;
391 385
392 trans = btrfs_start_transaction(root, 0); 386 spin_lock(&fs_info->super_lock);
393 if (IS_ERR(trans))
394 return PTR_ERR(trans);
395
396 spin_lock(&root->fs_info->super_lock);
397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); 387 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len); 388 memcpy(fs_info->super_copy->label, buf, p_len);
399 spin_unlock(&root->fs_info->super_lock); 389 spin_unlock(&fs_info->super_lock);
400 ret = btrfs_commit_transaction(trans, root);
401 390
402 if (!ret) 391 /*
403 return len; 392 * We don't want to do full transaction commit from inside sysfs
393 */
394 btrfs_set_pending(fs_info, COMMIT);
395 wake_up_process(fs_info->transaction_kthread);
404 396
405 return ret; 397 return len;
406} 398}
407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); 399BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
408 400
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
76 } 76 }
77} 77}
78 78
79static void clear_btree_io_tree(struct extent_io_tree *tree)
80{
81 spin_lock(&tree->lock);
82 while (!RB_EMPTY_ROOT(&tree->state)) {
83 struct rb_node *node;
84 struct extent_state *state;
85
86 node = rb_first(&tree->state);
87 state = rb_entry(node, struct extent_state, rb_node);
88 rb_erase(&state->rb_node, &tree->state);
89 RB_CLEAR_NODE(&state->rb_node);
90 /*
91 * btree io trees aren't supposed to have tasks waiting for
92 * changes in the flags of extent states ever.
93 */
94 ASSERT(!waitqueue_active(&state->wq));
95 free_extent_state(state);
96 if (need_resched()) {
97 spin_unlock(&tree->lock);
98 cond_resched();
99 spin_lock(&tree->lock);
100 }
101 }
102 spin_unlock(&tree->lock);
103}
104
79static noinline void switch_commit_roots(struct btrfs_transaction *trans, 105static noinline void switch_commit_roots(struct btrfs_transaction *trans,
80 struct btrfs_fs_info *fs_info) 106 struct btrfs_fs_info *fs_info)
81{ 107{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
89 root->commit_root = btrfs_root_node(root); 115 root->commit_root = btrfs_root_node(root);
90 if (is_fstree(root->objectid)) 116 if (is_fstree(root->objectid))
91 btrfs_unpin_free_ino(root); 117 btrfs_unpin_free_ino(root);
118 clear_btree_io_tree(&root->dirty_log_pages);
92 } 119 }
93 up_write(&fs_info->commit_root_sem); 120 up_write(&fs_info->commit_root_sem);
94} 121}
@@ -220,6 +247,7 @@ loop:
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 247 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->pending_chunks); 248 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits); 249 INIT_LIST_HEAD(&cur_trans->switch_commits);
250 INIT_LIST_HEAD(&cur_trans->pending_ordered);
223 list_add_tail(&cur_trans->list, &fs_info->trans_list); 251 list_add_tail(&cur_trans->list, &fs_info->trans_list);
224 extent_io_tree_init(&cur_trans->dirty_pages, 252 extent_io_tree_init(&cur_trans->dirty_pages,
225 fs_info->btree_inode->i_mapping); 253 fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
488 h->sync = false; 516 h->sync = false;
489 INIT_LIST_HEAD(&h->qgroup_ref_list); 517 INIT_LIST_HEAD(&h->qgroup_ref_list);
490 INIT_LIST_HEAD(&h->new_bgs); 518 INIT_LIST_HEAD(&h->new_bgs);
519 INIT_LIST_HEAD(&h->ordered);
491 520
492 smp_mb(); 521 smp_mb();
493 if (cur_trans->state >= TRANS_STATE_BLOCKED && 522 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
719 if (!list_empty(&trans->new_bgs)) 748 if (!list_empty(&trans->new_bgs))
720 btrfs_create_pending_block_groups(trans, root); 749 btrfs_create_pending_block_groups(trans, root);
721 750
751 if (!list_empty(&trans->ordered)) {
752 spin_lock(&info->trans_lock);
753 list_splice(&trans->ordered, &cur_trans->pending_ordered);
754 spin_unlock(&info->trans_lock);
755 }
756
722 trans->delayed_ref_updates = 0; 757 trans->delayed_ref_updates = 0;
723 if (!trans->sync) { 758 if (!trans->sync) {
724 must_run_delayed_refs = 759 must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
828 863
829 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 864 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
830 mark, &cached_state)) { 865 mark, &cached_state)) {
831 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 866 bool wait_writeback = false;
832 mark, &cached_state, GFP_NOFS); 867
833 cached_state = NULL; 868 err = convert_extent_bit(dirty_pages, start, end,
834 err = filemap_fdatawrite_range(mapping, start, end); 869 EXTENT_NEED_WAIT,
870 mark, &cached_state, GFP_NOFS);
871 /*
872 * convert_extent_bit can return -ENOMEM, which is most of the
873 * time a temporary error. So when it happens, ignore the error
874 * and wait for writeback of this range to finish - because we
875 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
876 * to btrfs_wait_marked_extents() would not know that writeback
877 * for this range started and therefore wouldn't wait for it to
878 * finish - we don't want to commit a superblock that points to
879 * btree nodes/leafs for which writeback hasn't finished yet
880 * (and without errors).
881 * We cleanup any entries left in the io tree when committing
882 * the transaction (through clear_btree_io_tree()).
883 */
884 if (err == -ENOMEM) {
885 err = 0;
886 wait_writeback = true;
887 }
888 if (!err)
889 err = filemap_fdatawrite_range(mapping, start, end);
835 if (err) 890 if (err)
836 werr = err; 891 werr = err;
892 else if (wait_writeback)
893 werr = filemap_fdatawait_range(mapping, start, end);
894 free_extent_state(cached_state);
895 cached_state = NULL;
837 cond_resched(); 896 cond_resched();
838 start = end + 1; 897 start = end + 1;
839 } 898 }
840 if (err)
841 werr = err;
842 return werr; 899 return werr;
843} 900}
844 901
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
862 919
863 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 920 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
864 EXTENT_NEED_WAIT, &cached_state)) { 921 EXTENT_NEED_WAIT, &cached_state)) {
865 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 922 /*
866 0, 0, &cached_state, GFP_NOFS); 923 * Ignore -ENOMEM errors returned by clear_extent_bit().
867 err = filemap_fdatawait_range(mapping, start, end); 924 * When committing the transaction, we'll remove any entries
925 * left in the io tree. For a log commit, we don't remove them
926 * after committing the log because the tree can be accessed
927 * concurrently - we do it only at transaction commit time when
928 * it's safe to do it (through clear_btree_io_tree()).
929 */
930 err = clear_extent_bit(dirty_pages, start, end,
931 EXTENT_NEED_WAIT,
932 0, 0, &cached_state, GFP_NOFS);
933 if (err == -ENOMEM)
934 err = 0;
935 if (!err)
936 err = filemap_fdatawait_range(mapping, start, end);
868 if (err) 937 if (err)
869 werr = err; 938 werr = err;
939 free_extent_state(cached_state);
940 cached_state = NULL;
870 cond_resched(); 941 cond_resched();
871 start = end + 1; 942 start = end + 1;
872 } 943 }
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
919 return 0; 990 return 0;
920} 991}
921 992
922int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 993static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root) 994 struct btrfs_root *root)
924{ 995{
925 if (!trans || !trans->transaction) { 996 int ret;
926 struct inode *btree_inode; 997
927 btree_inode = root->fs_info->btree_inode; 998 ret = btrfs_write_and_wait_marked_extents(root,
928 return filemap_write_and_wait(btree_inode->i_mapping);
929 }
930 return btrfs_write_and_wait_marked_extents(root,
931 &trans->transaction->dirty_pages, 999 &trans->transaction->dirty_pages,
932 EXTENT_DIRTY); 1000 EXTENT_DIRTY);
1001 clear_btree_io_tree(&trans->transaction->dirty_pages);
1002
1003 return ret;
933} 1004}
934 1005
935/* 1006/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1652 btrfs_wait_ordered_roots(fs_info, -1); 1723 btrfs_wait_ordered_roots(fs_info, -1);
1653} 1724}
1654 1725
1726static inline void
1727btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
1728 struct btrfs_fs_info *fs_info)
1729{
1730 struct btrfs_ordered_extent *ordered;
1731
1732 spin_lock(&fs_info->trans_lock);
1733 while (!list_empty(&cur_trans->pending_ordered)) {
1734 ordered = list_first_entry(&cur_trans->pending_ordered,
1735 struct btrfs_ordered_extent,
1736 trans_list);
1737 list_del_init(&ordered->trans_list);
1738 spin_unlock(&fs_info->trans_lock);
1739
1740 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
1741 &ordered->flags));
1742 btrfs_put_ordered_extent(ordered);
1743 spin_lock(&fs_info->trans_lock);
1744 }
1745 spin_unlock(&fs_info->trans_lock);
1746}
1747
1655int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1748int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1656 struct btrfs_root *root) 1749 struct btrfs_root *root)
1657{ 1750{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1702 } 1795 }
1703 1796
1704 spin_lock(&root->fs_info->trans_lock); 1797 spin_lock(&root->fs_info->trans_lock);
1798 list_splice(&trans->ordered, &cur_trans->pending_ordered);
1705 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1799 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1706 spin_unlock(&root->fs_info->trans_lock); 1800 spin_unlock(&root->fs_info->trans_lock);
1707 atomic_inc(&cur_trans->use_count); 1801 atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 1848
1755 btrfs_wait_delalloc_flush(root->fs_info); 1849 btrfs_wait_delalloc_flush(root->fs_info);
1756 1850
1851 btrfs_wait_pending_ordered(cur_trans, root->fs_info);
1852
1757 btrfs_scrub_pause(root); 1853 btrfs_scrub_pause(root);
1758 /* 1854 /*
1759 * Ok now we need to make sure to block out any other joins while we 1855 * Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1842 } 1938 }
1843 1939
1844 /* 1940 /*
1845 * Since the transaction is done, we should set the inode map cache flag 1941 * Since the transaction is done, we can apply the pending changes
1846 * before any other comming transaction. 1942 * before the next transaction.
1847 */ 1943 */
1848 if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) 1944 btrfs_apply_pending_changes(root->fs_info);
1849 btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1850 else
1851 btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1852 1945
1853 /* commit_fs_roots gets rid of all the tree log roots, it is now 1946 /* commit_fs_roots gets rid of all the tree log roots, it is now
1854 * safe to free the root of tree log roots 1947 * safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
2019 2112
2020 return (ret < 0) ? 0 : 1; 2113 return (ret < 0) ? 0 : 1;
2021} 2114}
2115
2116void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
2117{
2118 unsigned long prev;
2119 unsigned long bit;
2120
2121 prev = cmpxchg(&fs_info->pending_changes, 0, 0);
2122 if (!prev)
2123 return;
2124
2125 bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
2126 if (prev & bit)
2127 btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2128 prev &= ~bit;
2129
2130 bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
2131 if (prev & bit)
2132 btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2133 prev &= ~bit;
2134
2135 bit = 1 << BTRFS_PENDING_COMMIT;
2136 if (prev & bit)
2137 btrfs_debug(fs_info, "pending commit done");
2138 prev &= ~bit;
2139
2140 if (prev)
2141 btrfs_warn(fs_info,
2142 "unknown pending changes left 0x%lx, ignoring", prev);
2143}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..00ed29c4b3f9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head pending_chunks; 58 struct list_head pending_chunks;
59 struct list_head pending_ordered;
59 struct list_head switch_commits; 60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
105 */ 106 */
106 struct btrfs_root *root; 107 struct btrfs_root *root;
107 struct seq_list delayed_ref_elem; 108 struct seq_list delayed_ref_elem;
109 struct list_head ordered;
108 struct list_head qgroup_ref_list; 110 struct list_head qgroup_ref_list;
109 struct list_head new_bgs; 111 struct list_head new_bgs;
110}; 112};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
145 struct btrfs_root *root); 147 struct btrfs_root *root);
146struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 148struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
147int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 149int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
148int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
149 struct btrfs_root *root);
150 150
151void btrfs_add_dead_root(struct btrfs_root *root); 151void btrfs_add_dead_root(struct btrfs_root *root);
152int btrfs_defrag_root(struct btrfs_root *root); 152int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
170int btrfs_transaction_blocked(struct btrfs_fs_info *info); 170int btrfs_transaction_blocked(struct btrfs_fs_info *info);
171int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 171int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
172void btrfs_put_transaction(struct btrfs_transaction *transaction); 172void btrfs_put_transaction(struct btrfs_transaction *transaction);
173void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
174
173#endif 175#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213cec861..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2599 index2 = root_log_ctx.log_transid % 2; 2599 index2 = root_log_ctx.log_transid % 2;
2600 if (atomic_read(&log_root_tree->log_commit[index2])) { 2600 if (atomic_read(&log_root_tree->log_commit[index2])) {
2601 blk_finish_plug(&plug); 2601 blk_finish_plug(&plug);
2602 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2602 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
2603 mark);
2604 btrfs_wait_logged_extents(trans, log, log_transid);
2603 wait_log_commit(trans, log_root_tree, 2605 wait_log_commit(trans, log_root_tree,
2604 root_log_ctx.log_transid); 2606 root_log_ctx.log_transid);
2605 btrfs_free_logged_extents(log, log_transid);
2606 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2607 ret = root_log_ctx.log_ret; 2608 if (!ret)
2609 ret = root_log_ctx.log_ret;
2608 goto out; 2610 goto out;
2609 } 2611 }
2610 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2612 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2641 mutex_unlock(&log_root_tree->log_mutex); 2643 mutex_unlock(&log_root_tree->log_mutex);
2642 goto out_wake_log_root; 2644 goto out_wake_log_root;
2643 } 2645 }
2644 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2646 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2645 btrfs_wait_marked_extents(log_root_tree, 2647 if (!ret)
2646 &log_root_tree->dirty_log_pages, 2648 ret = btrfs_wait_marked_extents(log_root_tree,
2647 EXTENT_NEW | EXTENT_DIRTY); 2649 &log_root_tree->dirty_log_pages,
2648 btrfs_wait_logged_extents(log, log_transid); 2650 EXTENT_NEW | EXTENT_DIRTY);
2651 if (ret) {
2652 btrfs_set_log_full_commit(root->fs_info, trans);
2653 btrfs_free_logged_extents(log, log_transid);
2654 mutex_unlock(&log_root_tree->log_mutex);
2655 goto out_wake_log_root;
2656 }
2657 btrfs_wait_logged_extents(trans, log, log_transid);
2649 2658
2650 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2659 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2651 log_root_tree->node->start); 2660 log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3626 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3635 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3627 3636
3628 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3637 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3638 /*
3639 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3640 * i_mapping flags, so that the next fsync won't get
3641 * an outdated io error too.
3642 */
3643 btrfs_inode_check_errors(inode);
3629 *ordered_io_error = true; 3644 *ordered_io_error = true;
3630 break; 3645 break;
3631 } 3646 }
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3766 fi = btrfs_item_ptr(leaf, path->slots[0], 3781 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item); 3782 struct btrfs_file_extent_item);
3768 3783
3769 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3784 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
3770 &token); 3785 &token);
3771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3786 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3772 btrfs_set_token_file_extent_type(leaf, fi, 3787 btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3963 3978
3964 mutex_lock(&BTRFS_I(inode)->log_mutex); 3979 mutex_lock(&BTRFS_I(inode)->log_mutex);
3965 3980
3966 btrfs_get_logged_extents(inode, &logged_list); 3981 btrfs_get_logged_extents(inode, &logged_list, start, end);
3967 3982
3968 /* 3983 /*
3969 * a brute force approach to making sure we get the most uptodate 3984 * a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
4089 btrfs_release_path(path); 4104 btrfs_release_path(path);
4090 btrfs_release_path(dst_path); 4105 btrfs_release_path(dst_path);
4091 if (fast_search) { 4106 if (fast_search) {
4107 /*
4108 * Some ordered extents started by fsync might have completed
4109 * before we collected the ordered extents in logged_list, which
4110 * means they're gone, not in our logged_list nor in the inode's
4111 * ordered tree. We want the application/user space to know an
4112 * error happened while attempting to persist file data so that
4113 * it can take proper action. If such error happened, we leave
4114 * without writing to the log tree and the fsync must report the
4115 * file data write error and not commit the current transaction.
4116 */
4117 err = btrfs_inode_check_errors(inode);
4118 if (err) {
4119 ctx->io_err = err;
4120 goto out_unlock;
4121 }
4092 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4122 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4093 &logged_list, ctx); 4123 &logged_list, ctx);
4094 if (ret) { 4124 if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d47289c715c8..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
53DEFINE_MUTEX(uuid_mutex); 53DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids); 54static LIST_HEAD(fs_uuids);
55 55
56static void lock_chunks(struct btrfs_root *root)
57{
58 mutex_lock(&root->fs_info->chunk_mutex);
59}
60
61static void unlock_chunks(struct btrfs_root *root)
62{
63 mutex_unlock(&root->fs_info->chunk_mutex);
64}
65
66static struct btrfs_fs_devices *__alloc_fs_devices(void) 56static struct btrfs_fs_devices *__alloc_fs_devices(void)
67{ 57{
68 struct btrfs_fs_devices *fs_devs; 58 struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1068 u64 *start, u64 len) 1058 u64 *start, u64 len)
1069{ 1059{
1070 struct extent_map *em; 1060 struct extent_map *em;
1061 struct list_head *search_list = &trans->transaction->pending_chunks;
1071 int ret = 0; 1062 int ret = 0;
1072 1063
1073 list_for_each_entry(em, &trans->transaction->pending_chunks, list) { 1064again:
1065 list_for_each_entry(em, search_list, list) {
1074 struct map_lookup *map; 1066 struct map_lookup *map;
1075 int i; 1067 int i;
1076 1068
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1087 ret = 1; 1079 ret = 1;
1088 } 1080 }
1089 } 1081 }
1082 if (search_list == &trans->transaction->pending_chunks) {
1083 search_list = &trans->root->fs_info->pinned_chunks;
1084 goto again;
1085 }
1090 1086
1091 return ret; 1087 return ret;
1092} 1088}
@@ -1800,8 +1796,8 @@ error_undo:
1800 goto error_brelse; 1796 goto error_brelse;
1801} 1797}
1802 1798
1803void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1799void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1804 struct btrfs_device *srcdev) 1800 struct btrfs_device *srcdev)
1805{ 1801{
1806 struct btrfs_fs_devices *fs_devices; 1802 struct btrfs_fs_devices *fs_devices;
1807 1803
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1829 1825
1830 if (srcdev->bdev) 1826 if (srcdev->bdev)
1831 fs_devices->open_devices--; 1827 fs_devices->open_devices--;
1828}
1829
1830void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1831 struct btrfs_device *srcdev)
1832{
1833 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1832 1834
1833 call_rcu(&srcdev->rcu, free_device); 1835 call_rcu(&srcdev->rcu, free_device);
1834 1836
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2647 } 2649 }
2648 } 2650 }
2649 2651
2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2652 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2651 if (ret) { 2653 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret); 2654 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out; 2655 goto out;
2654 } 2656 }
2655 2657
2656 write_lock(&em_tree->lock);
2657 remove_extent_mapping(em_tree, em);
2658 write_unlock(&em_tree->lock);
2659
2660 /* once for the tree */
2661 free_extent_map(em);
2662out: 2658out:
2663 /* once for us */ 2659 /* once for us */
2664 free_extent_map(em); 2660 free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
4505 free_extent_map(em); 4501 free_extent_map(em);
4506 /* One for the tree reference */ 4502 /* One for the tree reference */
4507 free_extent_map(em); 4503 free_extent_map(em);
4504 /* One for the pending_chunks list reference */
4505 free_extent_map(em);
4508error: 4506error:
4509 kfree(devices_info); 4507 kfree(devices_info);
4510 return ret; 4508 return ret;
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
4881static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4882{ 4880{
4883 struct btrfs_bio_stripe s; 4881 struct btrfs_bio_stripe s;
4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4884 int i; 4883 int i;
4885 u64 l; 4884 u64 l;
4886 int again = 1; 4885 int again = 1;
4886 int m;
4887 4887
4888 while (again) { 4888 while (again) {
4889 again = 0; 4889 again = 0;
4890 for (i = 0; i < bbio->num_stripes - 1; i++) { 4890 for (i = 0; i < real_stripes - 1; i++) {
4891 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4891 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4892 s = bbio->stripes[i]; 4892 s = bbio->stripes[i];
4893 l = raid_map[i]; 4893 l = raid_map[i];
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4895 raid_map[i] = raid_map[i+1]; 4895 raid_map[i] = raid_map[i+1];
4896 bbio->stripes[i+1] = s; 4896 bbio->stripes[i+1] = s;
4897 raid_map[i+1] = l; 4897 raid_map[i+1] = l;
4898
4899 if (bbio->tgtdev_map) {
4900 m = bbio->tgtdev_map[i];
4901 bbio->tgtdev_map[i] =
4902 bbio->tgtdev_map[i + 1];
4903 bbio->tgtdev_map[i + 1] = m;
4904 }
4905
4898 again = 1; 4906 again = 1;
4899 } 4907 }
4900 } 4908 }
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4923 int ret = 0; 4931 int ret = 0;
4924 int num_stripes; 4932 int num_stripes;
4925 int max_errors = 0; 4933 int max_errors = 0;
4934 int tgtdev_indexes = 0;
4926 struct btrfs_bio *bbio = NULL; 4935 struct btrfs_bio *bbio = NULL;
4927 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4936 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4928 int dev_replace_is_ongoing = 0; 4937 int dev_replace_is_ongoing = 0;
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5161 BTRFS_BLOCK_GROUP_RAID6)) { 5170 BTRFS_BLOCK_GROUP_RAID6)) {
5162 u64 tmp; 5171 u64 tmp;
5163 5172
5164 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 5173 if (raid_map_ret &&
5165 && raid_map_ret) { 5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 mirror_num > 1)) {
5166 int i, rot; 5176 int i, rot;
5167 5177
5168 /* push stripe_nr back to the start of the full stripe */ 5178 /* push stripe_nr back to the start of the full stripe */
5169 stripe_nr = raid56_full_stripe_start; 5179 stripe_nr = raid56_full_stripe_start;
5170 do_div(stripe_nr, stripe_len); 5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map));
5171
5172 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5173 5181
5174 /* RAID[56] write or recovery. Return all stripes */ 5182 /* RAID[56] write or recovery. Return all stripes */
5175 num_stripes = map->num_stripes; 5183 num_stripes = map->num_stripes;
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5235 num_alloc_stripes <<= 1; 5243 num_alloc_stripes <<= 1;
5236 if (rw & REQ_GET_READ_MIRRORS) 5244 if (rw & REQ_GET_READ_MIRRORS)
5237 num_alloc_stripes++; 5245 num_alloc_stripes++;
5246 tgtdev_indexes = num_stripes;
5238 } 5247 }
5239 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 5248
5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
5250 GFP_NOFS);
5240 if (!bbio) { 5251 if (!bbio) {
5241 kfree(raid_map); 5252 kfree(raid_map);
5242 ret = -ENOMEM; 5253 ret = -ENOMEM;
5243 goto out; 5254 goto out;
5244 } 5255 }
5245 atomic_set(&bbio->error, 0); 5256 atomic_set(&bbio->error, 0);
5257 if (dev_replace_is_ongoing)
5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5246 5259
5247 if (rw & REQ_DISCARD) { 5260 if (rw & REQ_DISCARD) {
5248 int factor = 0; 5261 int factor = 0;
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5328 max_errors = btrfs_chunk_max_errors(map); 5341 max_errors = btrfs_chunk_max_errors(map);
5329 5342
5343 tgtdev_indexes = 0;
5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5331 dev_replace->tgtdev != NULL) { 5345 dev_replace->tgtdev != NULL) {
5332 int index_where_to_add; 5346 int index_where_to_add;
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5355 new->physical = old->physical; 5369 new->physical = old->physical;
5356 new->length = old->length; 5370 new->length = old->length;
5357 new->dev = dev_replace->tgtdev; 5371 new->dev = dev_replace->tgtdev;
5372 bbio->tgtdev_map[i] = index_where_to_add;
5358 index_where_to_add++; 5373 index_where_to_add++;
5359 max_errors++; 5374 max_errors++;
5375 tgtdev_indexes++;
5360 } 5376 }
5361 } 5377 }
5362 num_stripes = index_where_to_add; 5378 num_stripes = index_where_to_add;
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5402 tgtdev_stripe->length = 5418 tgtdev_stripe->length =
5403 bbio->stripes[index_srcdev].length; 5419 bbio->stripes[index_srcdev].length;
5404 tgtdev_stripe->dev = dev_replace->tgtdev; 5420 tgtdev_stripe->dev = dev_replace->tgtdev;
5421 bbio->tgtdev_map[index_srcdev] = num_stripes;
5405 5422
5423 tgtdev_indexes++;
5406 num_stripes++; 5424 num_stripes++;
5407 } 5425 }
5408 } 5426 }
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5412 bbio->num_stripes = num_stripes; 5430 bbio->num_stripes = num_stripes;
5413 bbio->max_errors = max_errors; 5431 bbio->max_errors = max_errors;
5414 bbio->mirror_num = mirror_num; 5432 bbio->mirror_num = mirror_num;
5433 bbio->num_tgtdevs = tgtdev_indexes;
5415 5434
5416 /* 5435 /*
5417 * this is the case that REQ_READ && dev_replace_is_ongoing && 5436 * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5443 mirror_num, NULL); 5462 mirror_num, NULL);
5444} 5463}
5445 5464
5465/* For Scrub/replace */
5466int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 u64 logical, u64 *length,
5468 struct btrfs_bio **bbio_ret, int mirror_num,
5469 u64 **raid_map_ret)
5470{
5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 mirror_num, raid_map_ret);
5473}
5474
5446int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5475int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5447 u64 chunk_start, u64 physical, u64 devid, 5476 u64 chunk_start, u64 physical, u64 devid,
5448 u64 **logical, int *naddrs, int *stripe_len) 5477 u64 **logical, int *naddrs, int *stripe_len)
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5812 } else { 5841 } else {
5813 ret = raid56_parity_recover(root, bio, bbio, 5842 ret = raid56_parity_recover(root, bio, bbio,
5814 raid_map, map_length, 5843 raid_map, map_length,
5815 mirror_num); 5844 mirror_num, 1);
5816 } 5845 }
5817 /* 5846
5818 * FIXME, replace dosen't support raid56 yet, please fix
5819 * it in the future.
5820 */
5821 btrfs_bio_counter_dec(root->fs_info); 5847 btrfs_bio_counter_dec(root->fs_info);
5822 return ret; 5848 return ret;
5823 } 5849 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa23039..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
292struct btrfs_bio; 292struct btrfs_bio;
293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); 293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
294 294
295#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
296 296
297struct btrfs_bio { 297struct btrfs_bio {
298 atomic_t stripes_pending; 298 atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
305 int max_errors; 305 int max_errors;
306 int num_stripes; 306 int num_stripes;
307 int mirror_num; 307 int mirror_num;
308 int num_tgtdevs;
309 int *tgtdev_map;
308 struct btrfs_bio_stripe stripes[]; 310 struct btrfs_bio_stripe stripes[];
309}; 311};
310 312
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
387int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 389int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
388 u64 end, u64 *length); 390 u64 end, u64 *length);
389 391
390#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ 392#define btrfs_bio_size(total_stripes, real_stripes) \
391 (sizeof(struct btrfs_bio_stripe) * (n))) 393 (sizeof(struct btrfs_bio) + \
394 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
395 (sizeof(int) * (real_stripes)))
392 396
393int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 397int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
394 u64 logical, u64 *length, 398 u64 logical, u64 *length,
395 struct btrfs_bio **bbio_ret, int mirror_num); 399 struct btrfs_bio **bbio_ret, int mirror_num);
400int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
401 u64 logical, u64 *length,
402 struct btrfs_bio **bbio_ret, int mirror_num,
403 u64 **raid_map_ret);
396int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 404int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
397 u64 chunk_start, u64 physical, u64 devid, 405 u64 chunk_start, u64 physical, u64 devid,
398 u64 **logical, int *naddrs, int *stripe_len); 406 u64 **logical, int *naddrs, int *stripe_len);
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
448int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 456int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
449int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 457int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
450 struct btrfs_fs_info *fs_info); 458 struct btrfs_fs_info *fs_info);
451void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 459void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
452 struct btrfs_device *srcdev); 460 struct btrfs_device *srcdev);
461void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
462 struct btrfs_device *srcdev);
453void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 463void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
454 struct btrfs_device *tgtdev); 464 struct btrfs_device *tgtdev);
455void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 465void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); 523void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, 524void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction); 525 struct btrfs_transaction *transaction);
526
527static inline void lock_chunks(struct btrfs_root *root)
528{
529 mutex_lock(&root->fs_info->chunk_mutex);
530}
531
532static inline void unlock_chunks(struct btrfs_root *root)
533{
534 mutex_unlock(&root->fs_info->chunk_mutex);
535}
536
537
516#endif 538#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf20131fbe4..47b19465f0dc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
29#include "xattr.h" 29#include "xattr.h"
30#include "disk-io.h" 30#include "disk-io.h"
31#include "props.h" 31#include "props.h"
32#include "locking.h"
32 33
33 34
34ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 35ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
91 struct inode *inode, const char *name, 92 struct inode *inode, const char *name,
92 const void *value, size_t size, int flags) 93 const void *value, size_t size, int flags)
93{ 94{
94 struct btrfs_dir_item *di; 95 struct btrfs_dir_item *di = NULL;
95 struct btrfs_root *root = BTRFS_I(inode)->root; 96 struct btrfs_root *root = BTRFS_I(inode)->root;
96 struct btrfs_path *path; 97 struct btrfs_path *path;
97 size_t name_len = strlen(name); 98 size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
103 path = btrfs_alloc_path(); 104 path = btrfs_alloc_path();
104 if (!path) 105 if (!path)
105 return -ENOMEM; 106 return -ENOMEM;
107 path->skip_release_on_error = 1;
108
109 if (!value) {
110 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
111 name, name_len, -1);
112 if (!di && (flags & XATTR_REPLACE))
113 ret = -ENODATA;
114 else if (di)
115 ret = btrfs_delete_one_dir_name(trans, root, path, di);
116 goto out;
117 }
106 118
119 /*
120 * For a replace we can't just do the insert blindly.
121 * Do a lookup first (read-only btrfs_search_slot), and return if xattr
122 * doesn't exist. If it exists, fall down below to the insert/replace
123 * path - we can't race with a concurrent xattr delete, because the VFS
124 * locks the inode's i_mutex before calling setxattr or removexattr.
125 */
107 if (flags & XATTR_REPLACE) { 126 if (flags & XATTR_REPLACE) {
108 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, 127 ASSERT(mutex_is_locked(&inode->i_mutex));
109 name_len, -1); 128 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
110 if (IS_ERR(di)) { 129 name, name_len, 0);
111 ret = PTR_ERR(di); 130 if (!di) {
112 goto out;
113 } else if (!di) {
114 ret = -ENODATA; 131 ret = -ENODATA;
115 goto out; 132 goto out;
116 } 133 }
117 ret = btrfs_delete_one_dir_name(trans, root, path, di);
118 if (ret)
119 goto out;
120 btrfs_release_path(path); 134 btrfs_release_path(path);
135 di = NULL;
136 }
121 137
138 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
139 name, name_len, value, size);
140 if (ret == -EOVERFLOW) {
122 /* 141 /*
123 * remove the attribute 142 * We have an existing item in a leaf, split_leaf couldn't
143 * expand it. That item might have or not a dir_item that
144 * matches our target xattr, so lets check.
124 */ 145 */
125 if (!value) 146 ret = 0;
126 goto out; 147 btrfs_assert_tree_locked(path->nodes[0]);
127 } else { 148 di = btrfs_match_dir_item_name(root, path, name, name_len);
128 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), 149 if (!di && !(flags & XATTR_REPLACE)) {
129 name, name_len, 0); 150 ret = -ENOSPC;
130 if (IS_ERR(di)) {
131 ret = PTR_ERR(di);
132 goto out; 151 goto out;
133 } 152 }
134 if (!di && !value) 153 } else if (ret == -EEXIST) {
135 goto out; 154 ret = 0;
136 btrfs_release_path(path); 155 di = btrfs_match_dir_item_name(root, path, name, name_len);
156 ASSERT(di); /* logic error */
157 } else if (ret) {
158 goto out;
137 } 159 }
138 160
139again: 161 if (di && (flags & XATTR_CREATE)) {
140 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
141 name, name_len, value, size);
142 /*
143 * If we're setting an xattr to a new value but the new value is say
144 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
145 * back from split_leaf. This is because it thinks we'll be extending
146 * the existing item size, but we're asking for enough space to add the
147 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
148 * the rest of the function figure it out.
149 */
150 if (ret == -EOVERFLOW)
151 ret = -EEXIST; 162 ret = -EEXIST;
163 goto out;
164 }
152 165
153 if (ret == -EEXIST) { 166 if (di) {
154 if (flags & XATTR_CREATE)
155 goto out;
156 /* 167 /*
157 * We can't use the path we already have since we won't have the 168 * We're doing a replace, and it must be atomic, that is, at
158 * proper locking for a delete, so release the path and 169 * any point in time we have either the old or the new xattr
159 * re-lookup to delete the thing. 170 * value in the tree. We don't want readers (getxattr and
171 * listxattrs) to miss a value, this is specially important
172 * for ACLs.
160 */ 173 */
161 btrfs_release_path(path); 174 const int slot = path->slots[0];
162 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), 175 struct extent_buffer *leaf = path->nodes[0];
163 name, name_len, -1); 176 const u16 old_data_len = btrfs_dir_data_len(leaf, di);
164 if (IS_ERR(di)) { 177 const u32 item_size = btrfs_item_size_nr(leaf, slot);
165 ret = PTR_ERR(di); 178 const u32 data_size = sizeof(*di) + name_len + size;
166 goto out; 179 struct btrfs_item *item;
167 } else if (!di) { 180 unsigned long data_ptr;
168 /* Shouldn't happen but just in case... */ 181 char *ptr;
169 btrfs_release_path(path); 182
170 goto again; 183 if (size > old_data_len) {
184 if (btrfs_leaf_free_space(root, leaf) <
185 (size - old_data_len)) {
186 ret = -ENOSPC;
187 goto out;
188 }
171 } 189 }
172 190
173 ret = btrfs_delete_one_dir_name(trans, root, path, di); 191 if (old_data_len + name_len + sizeof(*di) == item_size) {
174 if (ret) 192 /* No other xattrs packed in the same leaf item. */
175 goto out; 193 if (size > old_data_len)
194 btrfs_extend_item(root, path,
195 size - old_data_len);
196 else if (size < old_data_len)
197 btrfs_truncate_item(root, path, data_size, 1);
198 } else {
199 /* There are other xattrs packed in the same item. */
200 ret = btrfs_delete_one_dir_name(trans, root, path, di);
201 if (ret)
202 goto out;
203 btrfs_extend_item(root, path, data_size);
204 }
176 205
206 item = btrfs_item_nr(slot);
207 ptr = btrfs_item_ptr(leaf, slot, char);
208 ptr += btrfs_item_size(leaf, item) - data_size;
209 di = (struct btrfs_dir_item *)ptr;
210 btrfs_set_dir_data_len(leaf, di, size);
211 data_ptr = ((unsigned long)(di + 1)) + name_len;
212 write_extent_buffer(leaf, value, data_ptr, size);
213 btrfs_mark_buffer_dirty(leaf);
214 } else {
177 /* 215 /*
178 * We have a value to set, so go back and try to insert it now. 216 * Insert, and we had space for the xattr, so path->slots[0] is
217 * where our xattr dir_item is and btrfs_insert_xattr_item()
218 * filled it.
179 */ 219 */
180 if (value) {
181 btrfs_release_path(path);
182 goto again;
183 }
184 } 220 }
185out: 221out:
186 btrfs_free_path(path); 222 btrfs_free_path(path);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index db3f772e57ae..a75fba67bb1f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -158,17 +158,8 @@ struct ext4_allocation_request {
158#define EXT4_MAP_MAPPED (1 << BH_Mapped) 158#define EXT4_MAP_MAPPED (1 << BH_Mapped)
159#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) 159#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
160#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) 160#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
161/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
162 * ext4_map_blocks wants to know whether or not the underlying cluster has
163 * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
164 * the requested mapping was from previously mapped (or delayed allocated)
165 * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
166 * should never appear on buffer_head's state flags.
167 */
168#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
169#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ 161#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
170 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ 162 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
171 EXT4_MAP_FROM_CLUSTER)
172 163
173struct ext4_map_blocks { 164struct ext4_map_blocks {
174 ext4_fsblk_t m_pblk; 165 ext4_fsblk_t m_pblk;
@@ -565,10 +556,8 @@ enum {
565#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 556#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
566 /* Do not take i_data_sem locking in ext4_map_blocks */ 557 /* Do not take i_data_sem locking in ext4_map_blocks */
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 558#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
570 /* Convert written extents to unwritten */ 559 /* Convert written extents to unwritten */
571#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 560#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
572 561
573/* 562/*
574 * The bit position of these flags must not overlap with any of the 563 * The bit position of these flags must not overlap with any of the
@@ -889,10 +878,12 @@ struct ext4_inode_info {
889 /* extents status tree */ 878 /* extents status tree */
890 struct ext4_es_tree i_es_tree; 879 struct ext4_es_tree i_es_tree;
891 rwlock_t i_es_lock; 880 rwlock_t i_es_lock;
892 struct list_head i_es_lru; 881 struct list_head i_es_list;
893 unsigned int i_es_all_nr; /* protected by i_es_lock */ 882 unsigned int i_es_all_nr; /* protected by i_es_lock */
894 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 883 unsigned int i_es_shk_nr; /* protected by i_es_lock */
895 unsigned long i_touch_when; /* jiffies of last accessing */ 884 ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
885 extents to shrink. Protected by
886 i_es_lock */
896 887
897 /* ialloc */ 888 /* ialloc */
898 ext4_group_t i_last_alloc_group; 889 ext4_group_t i_last_alloc_group;
@@ -1337,10 +1328,11 @@ struct ext4_sb_info {
1337 1328
1338 /* Reclaim extents from extent status tree */ 1329 /* Reclaim extents from extent status tree */
1339 struct shrinker s_es_shrinker; 1330 struct shrinker s_es_shrinker;
1340 struct list_head s_es_lru; 1331 struct list_head s_es_list; /* List of inodes with reclaimable extents */
1332 long s_es_nr_inode;
1341 struct ext4_es_stats s_es_stats; 1333 struct ext4_es_stats s_es_stats;
1342 struct mb_cache *s_mb_cache; 1334 struct mb_cache *s_mb_cache;
1343 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lock ____cacheline_aligned_in_smp;
1344 1336
1345 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
1346 struct ratelimit_state s_err_ratelimit_state; 1338 struct ratelimit_state s_err_ratelimit_state;
@@ -2196,7 +2188,6 @@ extern int ext4_calculate_overhead(struct super_block *sb);
2196extern void ext4_superblock_csum_set(struct super_block *sb); 2188extern void ext4_superblock_csum_set(struct super_block *sb);
2197extern void *ext4_kvmalloc(size_t size, gfp_t flags); 2189extern void *ext4_kvmalloc(size_t size, gfp_t flags);
2198extern void *ext4_kvzalloc(size_t size, gfp_t flags); 2190extern void *ext4_kvzalloc(size_t size, gfp_t flags);
2199extern void ext4_kvfree(void *ptr);
2200extern int ext4_alloc_flex_bg_array(struct super_block *sb, 2191extern int ext4_alloc_flex_bg_array(struct super_block *sb,
2201 ext4_group_t ngroup); 2192 ext4_group_t ngroup);
2202extern const char *ext4_decode_error(struct super_block *sb, int errno, 2193extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2647,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
2647 int *retval); 2638 int *retval);
2648extern int ext4_inline_data_fiemap(struct inode *inode, 2639extern int ext4_inline_data_fiemap(struct inode *inode,
2649 struct fiemap_extent_info *fieinfo, 2640 struct fiemap_extent_info *fieinfo,
2650 int *has_inline); 2641 int *has_inline, __u64 start, __u64 len);
2651extern int ext4_try_to_evict_inline_data(handle_t *handle, 2642extern int ext4_try_to_evict_inline_data(handle_t *handle,
2652 struct inode *inode, 2643 struct inode *inode,
2653 int needed); 2644 int needed);
@@ -2795,16 +2786,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
2795extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 2786extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2796 2787
2797/* 2788/*
2798 * Note that these flags will never ever appear in a buffer_head's state flag.
2799 * See EXT4_MAP_... to see where this is used.
2800 */
2801enum ext4_state_bits {
2802 BH_AllocFromCluster /* allocated blocks were part of already
2803 * allocated cluster. */
2804 = BH_JBDPrivateStart
2805};
2806
2807/*
2808 * Add new method to test whether block and inode bitmaps are properly 2789 * Add new method to test whether block and inode bitmaps are properly
2809 * initialized. With uninit_bg reading the block from disk is not enough 2790 * initialized. With uninit_bg reading the block from disk is not enough
2810 * to mark the bitmap uptodate. We need to also zero-out the bitmap 2791 * to mark the bitmap uptodate. We need to also zero-out the bitmap
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0b16fb4c06d3..e5d3eadf47b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2306 ext4_lblk_t block) 2306 ext4_lblk_t block)
2307{ 2307{
2308 int depth = ext_depth(inode); 2308 int depth = ext_depth(inode);
2309 unsigned long len = 0; 2309 ext4_lblk_t len;
2310 ext4_lblk_t lblock = 0; 2310 ext4_lblk_t lblock;
2311 struct ext4_extent *ex; 2311 struct ext4_extent *ex;
2312 struct extent_status es;
2312 2313
2313 ex = path[depth].p_ext; 2314 ex = path[depth].p_ext;
2314 if (ex == NULL) { 2315 if (ex == NULL) {
2315 /* 2316 /* there is no extent yet, so gap is [0;-] */
2316 * there is no extent yet, so gap is [0;-] and we 2317 lblock = 0;
2317 * don't cache it 2318 len = EXT_MAX_BLOCKS;
2318 */
2319 ext_debug("cache gap(whole file):"); 2319 ext_debug("cache gap(whole file):");
2320 } else if (block < le32_to_cpu(ex->ee_block)) { 2320 } else if (block < le32_to_cpu(ex->ee_block)) {
2321 lblock = block; 2321 lblock = block;
@@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2324 block, 2324 block,
2325 le32_to_cpu(ex->ee_block), 2325 le32_to_cpu(ex->ee_block),
2326 ext4_ext_get_actual_len(ex)); 2326 ext4_ext_get_actual_len(ex));
2327 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2328 ext4_es_insert_extent(inode, lblock, len, ~0,
2329 EXTENT_STATUS_HOLE);
2330 } else if (block >= le32_to_cpu(ex->ee_block) 2327 } else if (block >= le32_to_cpu(ex->ee_block)
2331 + ext4_ext_get_actual_len(ex)) { 2328 + ext4_ext_get_actual_len(ex)) {
2332 ext4_lblk_t next; 2329 ext4_lblk_t next;
@@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2340 block); 2337 block);
2341 BUG_ON(next == lblock); 2338 BUG_ON(next == lblock);
2342 len = next - lblock; 2339 len = next - lblock;
2343 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2344 ext4_es_insert_extent(inode, lblock, len, ~0,
2345 EXTENT_STATUS_HOLE);
2346 } else { 2340 } else {
2347 BUG(); 2341 BUG();
2348 } 2342 }
2349 2343
2350 ext_debug(" -> %u:%lu\n", lblock, len); 2344 ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
2345 if (es.es_len) {
2346 /* There's delayed extent containing lblock? */
2347 if (es.es_lblk <= lblock)
2348 return;
2349 len = min(es.es_lblk - lblock, len);
2350 }
2351 ext_debug(" -> %u:%u\n", lblock, len);
2352 ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
2351} 2353}
2352 2354
2353/* 2355/*
@@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2481 ext4_lblk_t from, ext4_lblk_t to) 2483 ext4_lblk_t from, ext4_lblk_t to)
2482{ 2484{
2483 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2485 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2484 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2486 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2485 ext4_fsblk_t pblk; 2487 ext4_fsblk_t pblk;
2486 int flags = get_default_free_blocks_flags(inode); 2488 int flags = get_default_free_blocks_flags(inode);
2487 2489
@@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2490 * at the beginning of the extent. Instead, we make a note 2492 * at the beginning of the extent. Instead, we make a note
2491 * that we tried freeing the cluster, and check to see if we 2493 * that we tried freeing the cluster, and check to see if we
2492 * need to free it on a subsequent call to ext4_remove_blocks, 2494 * need to free it on a subsequent call to ext4_remove_blocks,
2493 * or at the end of the ext4_truncate() operation. 2495 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2494 */ 2496 */
2495 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2497 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2496 2498
@@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2501 * partial cluster here. 2503 * partial cluster here.
2502 */ 2504 */
2503 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2505 pblk = ext4_ext_pblock(ex) + ee_len - 1;
2504 if ((*partial_cluster > 0) && 2506 if (*partial_cluster > 0 &&
2505 (EXT4_B2C(sbi, pblk) != *partial_cluster)) { 2507 *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2506 ext4_free_blocks(handle, inode, NULL, 2508 ext4_free_blocks(handle, inode, NULL,
2507 EXT4_C2B(sbi, *partial_cluster), 2509 EXT4_C2B(sbi, *partial_cluster),
2508 sbi->s_cluster_ratio, flags); 2510 sbi->s_cluster_ratio, flags);
@@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2528 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2530 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2529 /* tail removal */ 2531 /* tail removal */
2530 ext4_lblk_t num; 2532 ext4_lblk_t num;
2531 unsigned int unaligned; 2533 long long first_cluster;
2532 2534
2533 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2535 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2534 pblk = ext4_ext_pblock(ex) + ee_len - num; 2536 pblk = ext4_ext_pblock(ex) + ee_len - num;
@@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2538 * used by any other extent (partial_cluster is negative). 2540 * used by any other extent (partial_cluster is negative).
2539 */ 2541 */
2540 if (*partial_cluster < 0 && 2542 if (*partial_cluster < 0 &&
2541 -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) 2543 *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
2542 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2544 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2543 2545
2544 ext_debug("free last %u blocks starting %llu partial %lld\n", 2546 ext_debug("free last %u blocks starting %llu partial %lld\n",
@@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2549 * beginning of a cluster, and we removed the entire 2551 * beginning of a cluster, and we removed the entire
2550 * extent and the cluster is not used by any other extent, 2552 * extent and the cluster is not used by any other extent,
2551 * save the partial cluster here, since we might need to 2553 * save the partial cluster here, since we might need to
2552 * delete if we determine that the truncate operation has 2554 * delete if we determine that the truncate or punch hole
2553 * removed all of the blocks in the cluster. 2555 * operation has removed all of the blocks in the cluster.
2556 * If that cluster is used by another extent, preserve its
2557 * negative value so it isn't freed later on.
2554 * 2558 *
2555 * On the other hand, if we did not manage to free the whole 2559 * If the whole extent wasn't freed, we've reached the
2556 * extent, we have to mark the cluster as used (store negative 2560 * start of the truncated/punched region and have finished
2557 * cluster number in partial_cluster). 2561 * removing blocks. If there's a partial cluster here it's
2562 * shared with the remainder of the extent and is no longer
2563 * a candidate for removal.
2558 */ 2564 */
2559 unaligned = EXT4_PBLK_COFF(sbi, pblk); 2565 if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
2560 if (unaligned && (ee_len == num) && 2566 first_cluster = (long long) EXT4_B2C(sbi, pblk);
2561 (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) 2567 if (first_cluster != -*partial_cluster)
2562 *partial_cluster = EXT4_B2C(sbi, pblk); 2568 *partial_cluster = first_cluster;
2563 else if (unaligned) 2569 } else {
2564 *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2565 else if (*partial_cluster > 0)
2566 *partial_cluster = 0; 2570 *partial_cluster = 0;
2571 }
2567 } else 2572 } else
2568 ext4_error(sbi->s_sb, "strange request: removal(2) " 2573 ext4_error(sbi->s_sb, "strange request: removal(2) "
2569 "%u-%u from %u:%u\n", 2574 "%u-%u from %u:%u\n",
@@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2574 2579
2575/* 2580/*
2576 * ext4_ext_rm_leaf() Removes the extents associated with the 2581 * ext4_ext_rm_leaf() Removes the extents associated with the
2577 * blocks appearing between "start" and "end", and splits the extents 2582 * blocks appearing between "start" and "end". Both "start"
2578 * if "start" and "end" appear in the same extent 2583 * and "end" must appear in the same extent or EIO is returned.
2579 * 2584 *
2580 * @handle: The journal handle 2585 * @handle: The journal handle
2581 * @inode: The files inode 2586 * @inode: The files inode
2582 * @path: The path to the leaf 2587 * @path: The path to the leaf
2583 * @partial_cluster: The cluster which we'll have to free if all extents 2588 * @partial_cluster: The cluster which we'll have to free if all extents
2584 * has been released from it. It gets negative in case 2589 * has been released from it. However, if this value is
2585 * that the cluster is still used. 2590 * negative, it's a cluster just to the right of the
2591 * punched region and it must not be freed.
2586 * @start: The first block to remove 2592 * @start: The first block to remove
2587 * @end: The last block to remove 2593 * @end: The last block to remove
2588 */ 2594 */
@@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2621 ex_ee_block = le32_to_cpu(ex->ee_block); 2627 ex_ee_block = le32_to_cpu(ex->ee_block);
2622 ex_ee_len = ext4_ext_get_actual_len(ex); 2628 ex_ee_len = ext4_ext_get_actual_len(ex);
2623 2629
2624 /*
2625 * If we're starting with an extent other than the last one in the
2626 * node, we need to see if it shares a cluster with the extent to
2627 * the right (towards the end of the file). If its leftmost cluster
2628 * is this extent's rightmost cluster and it is not cluster aligned,
2629 * we'll mark it as a partial that is not to be deallocated.
2630 */
2631
2632 if (ex != EXT_LAST_EXTENT(eh)) {
2633 ext4_fsblk_t current_pblk, right_pblk;
2634 long long current_cluster, right_cluster;
2635
2636 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2637 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2638 right_pblk = ext4_ext_pblock(ex + 1);
2639 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2640 if (current_cluster == right_cluster &&
2641 EXT4_PBLK_COFF(sbi, right_pblk))
2642 *partial_cluster = -right_cluster;
2643 }
2644
2645 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2630 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2646 2631
2647 while (ex >= EXT_FIRST_EXTENT(eh) && 2632 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2666 if (end < ex_ee_block) { 2651 if (end < ex_ee_block) {
2667 /* 2652 /*
2668 * We're going to skip this extent and move to another, 2653 * We're going to skip this extent and move to another,
2669 * so if this extent is not cluster aligned we have 2654 * so note that its first cluster is in use to avoid
2670 * to mark the current cluster as used to avoid 2655 * freeing it when removing blocks. Eventually, the
2671 * accidentally freeing it later on 2656 * right edge of the truncated/punched region will
2657 * be just to the left.
2672 */ 2658 */
2673 pblk = ext4_ext_pblock(ex); 2659 if (sbi->s_cluster_ratio > 1) {
2674 if (EXT4_PBLK_COFF(sbi, pblk)) 2660 pblk = ext4_ext_pblock(ex);
2675 *partial_cluster = 2661 *partial_cluster =
2676 -((long long)EXT4_B2C(sbi, pblk)); 2662 -(long long) EXT4_B2C(sbi, pblk);
2663 }
2677 ex--; 2664 ex--;
2678 ex_ee_block = le32_to_cpu(ex->ee_block); 2665 ex_ee_block = le32_to_cpu(ex->ee_block);
2679 ex_ee_len = ext4_ext_get_actual_len(ex); 2666 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2749 sizeof(struct ext4_extent)); 2736 sizeof(struct ext4_extent));
2750 } 2737 }
2751 le16_add_cpu(&eh->eh_entries, -1); 2738 le16_add_cpu(&eh->eh_entries, -1);
2752 } else if (*partial_cluster > 0) 2739 }
2753 *partial_cluster = 0;
2754 2740
2755 err = ext4_ext_dirty(handle, inode, path + depth); 2741 err = ext4_ext_dirty(handle, inode, path + depth);
2756 if (err) 2742 if (err)
@@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2769 /* 2755 /*
2770 * If there's a partial cluster and at least one extent remains in 2756 * If there's a partial cluster and at least one extent remains in
2771 * the leaf, free the partial cluster if it isn't shared with the 2757 * the leaf, free the partial cluster if it isn't shared with the
2772 * current extent. If there's a partial cluster and no extents 2758 * current extent. If it is shared with the current extent
2773 * remain in the leaf, it can't be freed here. It can only be 2759 * we zero partial_cluster because we've reached the start of the
2774 * freed when it's possible to determine if it's not shared with 2760 * truncated/punched region and we're done removing blocks.
2775 * any other extent - when the next leaf is processed or when space
2776 * removal is complete.
2777 */ 2761 */
2778 if (*partial_cluster > 0 && eh->eh_entries && 2762 if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
2779 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2763 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2780 *partial_cluster)) { 2764 if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2781 int flags = get_default_free_blocks_flags(inode); 2765 ext4_free_blocks(handle, inode, NULL,
2782 2766 EXT4_C2B(sbi, *partial_cluster),
2783 ext4_free_blocks(handle, inode, NULL, 2767 sbi->s_cluster_ratio,
2784 EXT4_C2B(sbi, *partial_cluster), 2768 get_default_free_blocks_flags(inode));
2785 sbi->s_cluster_ratio, flags); 2769 }
2786 *partial_cluster = 0; 2770 *partial_cluster = 0;
2787 } 2771 }
2788 2772
@@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2819int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2803int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2820 ext4_lblk_t end) 2804 ext4_lblk_t end)
2821{ 2805{
2822 struct super_block *sb = inode->i_sb; 2806 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2823 int depth = ext_depth(inode); 2807 int depth = ext_depth(inode);
2824 struct ext4_ext_path *path = NULL; 2808 struct ext4_ext_path *path = NULL;
2825 long long partial_cluster = 0; 2809 long long partial_cluster = 0;
@@ -2845,9 +2829,10 @@ again:
2845 */ 2829 */
2846 if (end < EXT_MAX_BLOCKS - 1) { 2830 if (end < EXT_MAX_BLOCKS - 1) {
2847 struct ext4_extent *ex; 2831 struct ext4_extent *ex;
2848 ext4_lblk_t ee_block; 2832 ext4_lblk_t ee_block, ex_end, lblk;
2833 ext4_fsblk_t pblk;
2849 2834
2850 /* find extent for this block */ 2835 /* find extent for or closest extent to this block */
2851 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2836 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2852 if (IS_ERR(path)) { 2837 if (IS_ERR(path)) {
2853 ext4_journal_stop(handle); 2838 ext4_journal_stop(handle);
@@ -2867,6 +2852,7 @@ again:
2867 } 2852 }
2868 2853
2869 ee_block = le32_to_cpu(ex->ee_block); 2854 ee_block = le32_to_cpu(ex->ee_block);
2855 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2870 2856
2871 /* 2857 /*
2872 * See if the last block is inside the extent, if so split 2858 * See if the last block is inside the extent, if so split
@@ -2874,8 +2860,19 @@ again:
2874 * tail of the first part of the split extent in 2860 * tail of the first part of the split extent in
2875 * ext4_ext_rm_leaf(). 2861 * ext4_ext_rm_leaf().
2876 */ 2862 */
2877 if (end >= ee_block && 2863 if (end >= ee_block && end < ex_end) {
2878 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2864
2865 /*
2866 * If we're going to split the extent, note that
2867 * the cluster containing the block after 'end' is
2868 * in use to avoid freeing it when removing blocks.
2869 */
2870 if (sbi->s_cluster_ratio > 1) {
2871 pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
2872 partial_cluster =
2873 -(long long) EXT4_B2C(sbi, pblk);
2874 }
2875
2879 /* 2876 /*
2880 * Split the extent in two so that 'end' is the last 2877 * Split the extent in two so that 'end' is the last
2881 * block in the first new extent. Also we should not 2878 * block in the first new extent. Also we should not
@@ -2886,6 +2883,24 @@ again:
2886 end + 1, 1); 2883 end + 1, 1);
2887 if (err < 0) 2884 if (err < 0)
2888 goto out; 2885 goto out;
2886
2887 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
2888 /*
2889 * If there's an extent to the right its first cluster
2890 * contains the immediate right boundary of the
2891 * truncated/punched region. Set partial_cluster to
2892 * its negative value so it won't be freed if shared
2893 * with the current extent. The end < ee_block case
2894 * is handled in ext4_ext_rm_leaf().
2895 */
2896 lblk = ex_end + 1;
2897 err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2898 &ex);
2899 if (err)
2900 goto out;
2901 if (pblk)
2902 partial_cluster =
2903 -(long long) EXT4_B2C(sbi, pblk);
2889 } 2904 }
2890 } 2905 }
2891 /* 2906 /*
@@ -2996,16 +3011,18 @@ again:
2996 trace_ext4_ext_remove_space_done(inode, start, end, depth, 3011 trace_ext4_ext_remove_space_done(inode, start, end, depth,
2997 partial_cluster, path->p_hdr->eh_entries); 3012 partial_cluster, path->p_hdr->eh_entries);
2998 3013
2999 /* If we still have something in the partial cluster and we have removed 3014 /*
3015 * If we still have something in the partial cluster and we have removed
3000 * even the first extent, then we should free the blocks in the partial 3016 * even the first extent, then we should free the blocks in the partial
3001 * cluster as well. */ 3017 * cluster as well. (This code will only run when there are no leaves
3002 if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { 3018 * to the immediate left of the truncated/punched region.)
3003 int flags = get_default_free_blocks_flags(inode); 3019 */
3004 3020 if (partial_cluster > 0 && err == 0) {
3021 /* don't zero partial_cluster since it's not used afterwards */
3005 ext4_free_blocks(handle, inode, NULL, 3022 ext4_free_blocks(handle, inode, NULL,
3006 EXT4_C2B(EXT4_SB(sb), partial_cluster), 3023 EXT4_C2B(sbi, partial_cluster),
3007 EXT4_SB(sb)->s_cluster_ratio, flags); 3024 sbi->s_cluster_ratio,
3008 partial_cluster = 0; 3025 get_default_free_blocks_flags(inode));
3009 } 3026 }
3010 3027
3011 /* TODO: flexible tree reduction should be here */ 3028 /* TODO: flexible tree reduction should be here */
@@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4267 ext4_io_end_t *io = ext4_inode_aio(inode); 4284 ext4_io_end_t *io = ext4_inode_aio(inode);
4268 ext4_lblk_t cluster_offset; 4285 ext4_lblk_t cluster_offset;
4269 int set_unwritten = 0; 4286 int set_unwritten = 0;
4287 bool map_from_cluster = false;
4270 4288
4271 ext_debug("blocks %u/%u requested for inode %lu\n", 4289 ext_debug("blocks %u/%u requested for inode %lu\n",
4272 map->m_lblk, map->m_len, inode->i_ino); 4290 map->m_lblk, map->m_len, inode->i_ino);
@@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4343 } 4361 }
4344 } 4362 }
4345 4363
4346 if ((sbi->s_cluster_ratio > 1) &&
4347 ext4_find_delalloc_cluster(inode, map->m_lblk))
4348 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4349
4350 /* 4364 /*
4351 * requested block isn't allocated yet; 4365 * requested block isn't allocated yet;
4352 * we couldn't try to create block if create flag is zero 4366 * we couldn't try to create block if create flag is zero
@@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4356 * put just found gap into cache to speed up 4370 * put just found gap into cache to speed up
4357 * subsequent requests 4371 * subsequent requests
4358 */ 4372 */
4359 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) 4373 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4360 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4361 goto out2; 4374 goto out2;
4362 } 4375 }
4363 4376
4364 /* 4377 /*
4365 * Okay, we need to do block allocation. 4378 * Okay, we need to do block allocation.
4366 */ 4379 */
4367 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4368 newex.ee_block = cpu_to_le32(map->m_lblk); 4380 newex.ee_block = cpu_to_le32(map->m_lblk);
4369 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4381 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4370 4382
@@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4376 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4388 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4377 ar.len = allocated = map->m_len; 4389 ar.len = allocated = map->m_len;
4378 newblock = map->m_pblk; 4390 newblock = map->m_pblk;
4379 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4391 map_from_cluster = true;
4380 goto got_allocated_blocks; 4392 goto got_allocated_blocks;
4381 } 4393 }
4382 4394
@@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4397 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { 4409 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4398 ar.len = allocated = map->m_len; 4410 ar.len = allocated = map->m_len;
4399 newblock = map->m_pblk; 4411 newblock = map->m_pblk;
4400 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4412 map_from_cluster = true;
4401 goto got_allocated_blocks; 4413 goto got_allocated_blocks;
4402 } 4414 }
4403 4415
@@ -4523,7 +4535,7 @@ got_allocated_blocks:
4523 */ 4535 */
4524 reserved_clusters = get_reserved_cluster_alloc(inode, 4536 reserved_clusters = get_reserved_cluster_alloc(inode,
4525 map->m_lblk, allocated); 4537 map->m_lblk, allocated);
4526 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { 4538 if (map_from_cluster) {
4527 if (reserved_clusters) { 4539 if (reserved_clusters) {
4528 /* 4540 /*
4529 * We have clusters reserved for this range. 4541 * We have clusters reserved for this range.
@@ -4620,7 +4632,6 @@ out2:
4620 4632
4621 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4633 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4622 err ? err : allocated); 4634 err ? err : allocated);
4623 ext4_es_lru_add(inode);
4624 return err ? err : allocated; 4635 return err ? err : allocated;
4625} 4636}
4626 4637
@@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5140 if (ext4_has_inline_data(inode)) { 5151 if (ext4_has_inline_data(inode)) {
5141 int has_inline = 1; 5152 int has_inline = 1;
5142 5153
5143 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 5154 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5155 start, len);
5144 5156
5145 if (has_inline) 5157 if (has_inline)
5146 return error; 5158 return error;
@@ -5154,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5154 5166
5155 /* fallback to generic here if not in extents fmt */ 5167 /* fallback to generic here if not in extents fmt */
5156 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5157 return generic_block_fiemap(inode, fieinfo, start, len, 5169 return __generic_block_fiemap(inode, fieinfo, start, len,
5158 ext4_get_block); 5170 ext4_get_block);
5159 5171
5160 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5161 return -EBADR; 5173 return -EBADR;
@@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5179 error = ext4_fill_fiemap_extents(inode, start_blk, 5191 error = ext4_fill_fiemap_extents(inode, start_blk,
5180 len_blks, fieinfo); 5192 len_blks, fieinfo);
5181 } 5193 }
5182 ext4_es_lru_add(inode);
5183 return error; 5194 return error;
5184} 5195}
5185 5196
@@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5239 return -EIO; 5250 return -EIO;
5240 5251
5241 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 5252 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5242 if (!ex_last)
5243 return -EIO;
5244 5253
5245 err = ext4_access_path(handle, inode, path + depth); 5254 err = ext4_access_path(handle, inode, path + depth);
5246 if (err) 5255 if (err)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 94e7855ae71b..e04d45733976 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep;
147static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 147static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
149 ext4_lblk_t end); 149 ext4_lblk_t end);
150static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
151 int nr_to_scan); 151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 152 struct ext4_inode_info *locked_ei);
153 struct ext4_inode_info *locked_ei);
154 153
155int __init ext4_init_es(void) 154int __init ext4_init_es(void)
156{ 155{
@@ -298,6 +297,36 @@ out:
298 trace_ext4_es_find_delayed_extent_range_exit(inode, es); 297 trace_ext4_es_find_delayed_extent_range_exit(inode, es);
299} 298}
300 299
300static void ext4_es_list_add(struct inode *inode)
301{
302 struct ext4_inode_info *ei = EXT4_I(inode);
303 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
304
305 if (!list_empty(&ei->i_es_list))
306 return;
307
308 spin_lock(&sbi->s_es_lock);
309 if (list_empty(&ei->i_es_list)) {
310 list_add_tail(&ei->i_es_list, &sbi->s_es_list);
311 sbi->s_es_nr_inode++;
312 }
313 spin_unlock(&sbi->s_es_lock);
314}
315
316static void ext4_es_list_del(struct inode *inode)
317{
318 struct ext4_inode_info *ei = EXT4_I(inode);
319 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
320
321 spin_lock(&sbi->s_es_lock);
322 if (!list_empty(&ei->i_es_list)) {
323 list_del_init(&ei->i_es_list);
324 sbi->s_es_nr_inode--;
325 WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
326 }
327 spin_unlock(&sbi->s_es_lock);
328}
329
301static struct extent_status * 330static struct extent_status *
302ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, 331ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
303 ext4_fsblk_t pblk) 332 ext4_fsblk_t pblk)
@@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
314 * We don't count delayed extent because we never try to reclaim them 343 * We don't count delayed extent because we never try to reclaim them
315 */ 344 */
316 if (!ext4_es_is_delayed(es)) { 345 if (!ext4_es_is_delayed(es)) {
317 EXT4_I(inode)->i_es_lru_nr++; 346 if (!EXT4_I(inode)->i_es_shk_nr++)
347 ext4_es_list_add(inode);
318 percpu_counter_inc(&EXT4_SB(inode->i_sb)-> 348 percpu_counter_inc(&EXT4_SB(inode->i_sb)->
319 s_es_stats.es_stats_lru_cnt); 349 s_es_stats.es_stats_shk_cnt);
320 } 350 }
321 351
322 EXT4_I(inode)->i_es_all_nr++; 352 EXT4_I(inode)->i_es_all_nr++;
@@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
330 EXT4_I(inode)->i_es_all_nr--; 360 EXT4_I(inode)->i_es_all_nr--;
331 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 361 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
332 362
333 /* Decrease the lru counter when this es is not delayed */ 363 /* Decrease the shrink counter when this es is not delayed */
334 if (!ext4_es_is_delayed(es)) { 364 if (!ext4_es_is_delayed(es)) {
335 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 365 BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
336 EXT4_I(inode)->i_es_lru_nr--; 366 if (!--EXT4_I(inode)->i_es_shk_nr)
367 ext4_es_list_del(inode);
337 percpu_counter_dec(&EXT4_SB(inode->i_sb)-> 368 percpu_counter_dec(&EXT4_SB(inode->i_sb)->
338 s_es_stats.es_stats_lru_cnt); 369 s_es_stats.es_stats_shk_cnt);
339 } 370 }
340 371
341 kmem_cache_free(ext4_es_cachep, es); 372 kmem_cache_free(ext4_es_cachep, es);
@@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
351static int ext4_es_can_be_merged(struct extent_status *es1, 382static int ext4_es_can_be_merged(struct extent_status *es1,
352 struct extent_status *es2) 383 struct extent_status *es2)
353{ 384{
354 if (ext4_es_status(es1) != ext4_es_status(es2)) 385 if (ext4_es_type(es1) != ext4_es_type(es2))
355 return 0; 386 return 0;
356 387
357 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { 388 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
@@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
394 es1 = rb_entry(node, struct extent_status, rb_node); 425 es1 = rb_entry(node, struct extent_status, rb_node);
395 if (ext4_es_can_be_merged(es1, es)) { 426 if (ext4_es_can_be_merged(es1, es)) {
396 es1->es_len += es->es_len; 427 es1->es_len += es->es_len;
428 if (ext4_es_is_referenced(es))
429 ext4_es_set_referenced(es1);
397 rb_erase(&es->rb_node, &tree->root); 430 rb_erase(&es->rb_node, &tree->root);
398 ext4_es_free_extent(inode, es); 431 ext4_es_free_extent(inode, es);
399 es = es1; 432 es = es1;
@@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
416 es1 = rb_entry(node, struct extent_status, rb_node); 449 es1 = rb_entry(node, struct extent_status, rb_node);
417 if (ext4_es_can_be_merged(es, es1)) { 450 if (ext4_es_can_be_merged(es, es1)) {
418 es->es_len += es1->es_len; 451 es->es_len += es1->es_len;
452 if (ext4_es_is_referenced(es1))
453 ext4_es_set_referenced(es);
419 rb_erase(node, &tree->root); 454 rb_erase(node, &tree->root);
420 ext4_es_free_extent(inode, es1); 455 ext4_es_free_extent(inode, es1);
421 } 456 }
@@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
683 goto error; 718 goto error;
684retry: 719retry:
685 err = __es_insert_extent(inode, &newes); 720 err = __es_insert_extent(inode, &newes);
686 if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, 721 if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
687 EXT4_I(inode))) 722 128, EXT4_I(inode)))
688 goto retry; 723 goto retry;
689 if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) 724 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
690 err = 0; 725 err = 0;
@@ -782,6 +817,8 @@ out:
782 es->es_lblk = es1->es_lblk; 817 es->es_lblk = es1->es_lblk;
783 es->es_len = es1->es_len; 818 es->es_len = es1->es_len;
784 es->es_pblk = es1->es_pblk; 819 es->es_pblk = es1->es_pblk;
820 if (!ext4_es_is_referenced(es))
821 ext4_es_set_referenced(es);
785 stats->es_stats_cache_hits++; 822 stats->es_stats_cache_hits++;
786 } else { 823 } else {
787 stats->es_stats_cache_misses++; 824 stats->es_stats_cache_misses++;
@@ -841,8 +878,8 @@ retry:
841 es->es_lblk = orig_es.es_lblk; 878 es->es_lblk = orig_es.es_lblk;
842 es->es_len = orig_es.es_len; 879 es->es_len = orig_es.es_len;
843 if ((err == -ENOMEM) && 880 if ((err == -ENOMEM) &&
844 __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, 881 __es_shrink(EXT4_SB(inode->i_sb),
845 EXT4_I(inode))) 882 128, EXT4_I(inode)))
846 goto retry; 883 goto retry;
847 goto out; 884 goto out;
848 } 885 }
@@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
914 end = lblk + len - 1; 951 end = lblk + len - 1;
915 BUG_ON(end < lblk); 952 BUG_ON(end < lblk);
916 953
954 /*
955 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
956 * so that we are sure __es_shrink() is done with the inode before it
957 * is reclaimed.
958 */
917 write_lock(&EXT4_I(inode)->i_es_lock); 959 write_lock(&EXT4_I(inode)->i_es_lock);
918 err = __es_remove_extent(inode, lblk, end); 960 err = __es_remove_extent(inode, lblk, end);
919 write_unlock(&EXT4_I(inode)->i_es_lock); 961 write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
921 return err; 963 return err;
922} 964}
923 965
924static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, 966static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
925 struct list_head *b) 967 struct ext4_inode_info *locked_ei)
926{
927 struct ext4_inode_info *eia, *eib;
928 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
929 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
930
931 if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
932 !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
933 return 1;
934 if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
935 ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
936 return -1;
937 if (eia->i_touch_when == eib->i_touch_when)
938 return 0;
939 if (time_after(eia->i_touch_when, eib->i_touch_when))
940 return 1;
941 else
942 return -1;
943}
944
945static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
946 struct ext4_inode_info *locked_ei)
947{ 968{
948 struct ext4_inode_info *ei; 969 struct ext4_inode_info *ei;
949 struct ext4_es_stats *es_stats; 970 struct ext4_es_stats *es_stats;
950 struct list_head *cur, *tmp;
951 LIST_HEAD(skipped);
952 ktime_t start_time; 971 ktime_t start_time;
953 u64 scan_time; 972 u64 scan_time;
973 int nr_to_walk;
954 int nr_shrunk = 0; 974 int nr_shrunk = 0;
955 int retried = 0, skip_precached = 1, nr_skipped = 0; 975 int retried = 0, nr_skipped = 0;
956 976
957 es_stats = &sbi->s_es_stats; 977 es_stats = &sbi->s_es_stats;
958 start_time = ktime_get(); 978 start_time = ktime_get();
959 spin_lock(&sbi->s_es_lru_lock);
960 979
961retry: 980retry:
962 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 981 spin_lock(&sbi->s_es_lock);
963 int shrunk; 982 nr_to_walk = sbi->s_es_nr_inode;
964 983 while (nr_to_walk-- > 0) {
965 /* 984 if (list_empty(&sbi->s_es_list)) {
966 * If we have already reclaimed all extents from extent 985 spin_unlock(&sbi->s_es_lock);
967 * status tree, just stop the loop immediately. 986 goto out;
968 */ 987 }
969 if (percpu_counter_read_positive( 988 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
970 &es_stats->es_stats_lru_cnt) == 0) 989 i_es_list);
971 break; 990 /* Move the inode to the tail */
972 991 list_move_tail(&ei->i_es_list, &sbi->s_es_list);
973 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
974 992
975 /* 993 /*
976 * Skip the inode that is newer than the last_sorted 994 * Normally we try hard to avoid shrinking precached inodes,
977 * time. Normally we try hard to avoid shrinking 995 * but we will as a last resort.
978 * precached inodes, but we will as a last resort.
979 */ 996 */
980 if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || 997 if (!retried && ext4_test_inode_state(&ei->vfs_inode,
981 (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 998 EXT4_STATE_EXT_PRECACHED)) {
982 EXT4_STATE_EXT_PRECACHED))) {
983 nr_skipped++; 999 nr_skipped++;
984 list_move_tail(cur, &skipped);
985 continue; 1000 continue;
986 } 1001 }
987 1002
988 if (ei->i_es_lru_nr == 0 || ei == locked_ei || 1003 if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
989 !write_trylock(&ei->i_es_lock)) 1004 nr_skipped++;
990 continue; 1005 continue;
1006 }
1007 /*
1008 * Now we hold i_es_lock which protects us from inode reclaim
1009 * freeing inode under us
1010 */
1011 spin_unlock(&sbi->s_es_lock);
991 1012
992 shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); 1013 nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
993 if (ei->i_es_lru_nr == 0)
994 list_del_init(&ei->i_es_lru);
995 write_unlock(&ei->i_es_lock); 1014 write_unlock(&ei->i_es_lock);
996 1015
997 nr_shrunk += shrunk; 1016 if (nr_to_scan <= 0)
998 nr_to_scan -= shrunk; 1017 goto out;
999 if (nr_to_scan == 0) 1018 spin_lock(&sbi->s_es_lock);
1000 break;
1001 } 1019 }
1002 1020 spin_unlock(&sbi->s_es_lock);
1003 /* Move the newer inodes into the tail of the LRU list. */
1004 list_splice_tail(&skipped, &sbi->s_es_lru);
1005 INIT_LIST_HEAD(&skipped);
1006 1021
1007 /* 1022 /*
1008 * If we skipped any inodes, and we weren't able to make any 1023 * If we skipped any inodes, and we weren't able to make any
1009 * forward progress, sort the list and try again. 1024 * forward progress, try again to scan precached inodes.
1010 */ 1025 */
1011 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1026 if ((nr_shrunk == 0) && nr_skipped && !retried) {
1012 retried++; 1027 retried++;
1013 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
1014 es_stats->es_stats_last_sorted = jiffies;
1015 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
1016 i_es_lru);
1017 /*
1018 * If there are no non-precached inodes left on the
1019 * list, start releasing precached extents.
1020 */
1021 if (ext4_test_inode_state(&ei->vfs_inode,
1022 EXT4_STATE_EXT_PRECACHED))
1023 skip_precached = 0;
1024 goto retry; 1028 goto retry;
1025 } 1029 }
1026 1030
1027 spin_unlock(&sbi->s_es_lru_lock);
1028
1029 if (locked_ei && nr_shrunk == 0) 1031 if (locked_ei && nr_shrunk == 0)
1030 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1032 nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1031 1033
1034out:
1032 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1035 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1033 if (likely(es_stats->es_stats_scan_time)) 1036 if (likely(es_stats->es_stats_scan_time))
1034 es_stats->es_stats_scan_time = (scan_time + 1037 es_stats->es_stats_scan_time = (scan_time +
@@ -1043,7 +1046,7 @@ retry:
1043 else 1046 else
1044 es_stats->es_stats_shrunk = nr_shrunk; 1047 es_stats->es_stats_shrunk = nr_shrunk;
1045 1048
1046 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, 1049 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1047 nr_skipped, retried); 1050 nr_skipped, retried);
1048 return nr_shrunk; 1051 return nr_shrunk;
1049} 1052}
@@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
1055 struct ext4_sb_info *sbi; 1058 struct ext4_sb_info *sbi;
1056 1059
1057 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); 1060 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1058 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1061 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1059 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); 1062 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1060 return nr; 1063 return nr;
1061} 1064}
@@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
1068 int nr_to_scan = sc->nr_to_scan; 1071 int nr_to_scan = sc->nr_to_scan;
1069 int ret, nr_shrunk; 1072 int ret, nr_shrunk;
1070 1073
1071 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1074 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1072 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1075 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1073 1076
1074 if (!nr_to_scan) 1077 if (!nr_to_scan)
1075 return ret; 1078 return ret;
1076 1079
1077 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); 1080 nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1078 1081
1079 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); 1082 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1080 return nr_shrunk; 1083 return nr_shrunk;
@@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1102 return 0; 1105 return 0;
1103 1106
1104 /* here we just find an inode that has the max nr. of objects */ 1107 /* here we just find an inode that has the max nr. of objects */
1105 spin_lock(&sbi->s_es_lru_lock); 1108 spin_lock(&sbi->s_es_lock);
1106 list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { 1109 list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1107 inode_cnt++; 1110 inode_cnt++;
1108 if (max && max->i_es_all_nr < ei->i_es_all_nr) 1111 if (max && max->i_es_all_nr < ei->i_es_all_nr)
1109 max = ei; 1112 max = ei;
1110 else if (!max) 1113 else if (!max)
1111 max = ei; 1114 max = ei;
1112 } 1115 }
1113 spin_unlock(&sbi->s_es_lru_lock); 1116 spin_unlock(&sbi->s_es_lock);
1114 1117
1115 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1118 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1116 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1119 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1117 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); 1120 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1118 seq_printf(seq, " %lu/%lu cache hits/misses\n", 1121 seq_printf(seq, " %lu/%lu cache hits/misses\n",
1119 es_stats->es_stats_cache_hits, 1122 es_stats->es_stats_cache_hits,
1120 es_stats->es_stats_cache_misses); 1123 es_stats->es_stats_cache_misses);
1121 if (es_stats->es_stats_last_sorted != 0)
1122 seq_printf(seq, " %u ms last sorted interval\n",
1123 jiffies_to_msecs(jiffies -
1124 es_stats->es_stats_last_sorted));
1125 if (inode_cnt) 1124 if (inode_cnt)
1126 seq_printf(seq, " %d inodes on lru list\n", inode_cnt); 1125 seq_printf(seq, " %d inodes on list\n", inode_cnt);
1127 1126
1128 seq_printf(seq, "average:\n %llu us scan time\n", 1127 seq_printf(seq, "average:\n %llu us scan time\n",
1129 div_u64(es_stats->es_stats_scan_time, 1000)); 1128 div_u64(es_stats->es_stats_scan_time, 1000));
@@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1132 seq_printf(seq, 1131 seq_printf(seq,
1133 "maximum:\n %lu inode (%u objects, %u reclaimable)\n" 1132 "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1134 " %llu us max scan time\n", 1133 " %llu us max scan time\n",
1135 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, 1134 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1136 div_u64(es_stats->es_stats_max_scan_time, 1000)); 1135 div_u64(es_stats->es_stats_max_scan_time, 1000));
1137 1136
1138 return 0; 1137 return 0;
@@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1181{ 1180{
1182 int err; 1181 int err;
1183 1182
1184 INIT_LIST_HEAD(&sbi->s_es_lru); 1183 /* Make sure we have enough bits for physical block number */
1185 spin_lock_init(&sbi->s_es_lru_lock); 1184 BUILD_BUG_ON(ES_SHIFT < 48);
1186 sbi->s_es_stats.es_stats_last_sorted = 0; 1185 INIT_LIST_HEAD(&sbi->s_es_list);
1186 sbi->s_es_nr_inode = 0;
1187 spin_lock_init(&sbi->s_es_lock);
1187 sbi->s_es_stats.es_stats_shrunk = 0; 1188 sbi->s_es_stats.es_stats_shrunk = 0;
1188 sbi->s_es_stats.es_stats_cache_hits = 0; 1189 sbi->s_es_stats.es_stats_cache_hits = 0;
1189 sbi->s_es_stats.es_stats_cache_misses = 0; 1190 sbi->s_es_stats.es_stats_cache_misses = 0;
@@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1192 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); 1193 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1193 if (err) 1194 if (err)
1194 return err; 1195 return err;
1195 err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); 1196 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1196 if (err) 1197 if (err)
1197 goto err1; 1198 goto err1;
1198 1199
@@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1210 return 0; 1211 return 0;
1211 1212
1212err2: 1213err2:
1213 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1214 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1214err1: 1215err1:
1215 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1216 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1216 return err; 1217 return err;
@@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1221 if (sbi->s_proc) 1222 if (sbi->s_proc)
1222 remove_proc_entry("es_shrinker_info", sbi->s_proc); 1223 remove_proc_entry("es_shrinker_info", sbi->s_proc);
1223 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1225 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1225 unregister_shrinker(&sbi->s_es_shrinker); 1226 unregister_shrinker(&sbi->s_es_shrinker);
1226} 1227}
1227 1228
1228void ext4_es_lru_add(struct inode *inode) 1229/*
1230 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1231 * most *nr_to_scan extents, update *nr_to_scan accordingly.
1232 *
1233 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1234 * Increment *nr_shrunk by the number of reclaimed extents. Also update
1235 * ei->i_es_shrink_lblk to where we should continue scanning.
1236 */
1237static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1238 int *nr_to_scan, int *nr_shrunk)
1229{ 1239{
1230 struct ext4_inode_info *ei = EXT4_I(inode); 1240 struct inode *inode = &ei->vfs_inode;
1231 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1241 struct ext4_es_tree *tree = &ei->i_es_tree;
1232 1242 struct extent_status *es;
1233 ei->i_touch_when = jiffies; 1243 struct rb_node *node;
1234
1235 if (!list_empty(&ei->i_es_lru))
1236 return;
1237 1244
1238 spin_lock(&sbi->s_es_lru_lock); 1245 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1239 if (list_empty(&ei->i_es_lru)) 1246 if (!es)
1240 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 1247 goto out_wrap;
1241 spin_unlock(&sbi->s_es_lru_lock); 1248 node = &es->rb_node;
1242} 1249 while (*nr_to_scan > 0) {
1250 if (es->es_lblk > end) {
1251 ei->i_es_shrink_lblk = end + 1;
1252 return 0;
1253 }
1243 1254
1244void ext4_es_lru_del(struct inode *inode) 1255 (*nr_to_scan)--;
1245{ 1256 node = rb_next(&es->rb_node);
1246 struct ext4_inode_info *ei = EXT4_I(inode); 1257 /*
1247 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1258 * We can't reclaim delayed extent from status tree because
1259 * fiemap, bigallic, and seek_data/hole need to use it.
1260 */
1261 if (ext4_es_is_delayed(es))
1262 goto next;
1263 if (ext4_es_is_referenced(es)) {
1264 ext4_es_clear_referenced(es);
1265 goto next;
1266 }
1248 1267
1249 spin_lock(&sbi->s_es_lru_lock); 1268 rb_erase(&es->rb_node, &tree->root);
1250 if (!list_empty(&ei->i_es_lru)) 1269 ext4_es_free_extent(inode, es);
1251 list_del_init(&ei->i_es_lru); 1270 (*nr_shrunk)++;
1252 spin_unlock(&sbi->s_es_lru_lock); 1271next:
1272 if (!node)
1273 goto out_wrap;
1274 es = rb_entry(node, struct extent_status, rb_node);
1275 }
1276 ei->i_es_shrink_lblk = es->es_lblk;
1277 return 1;
1278out_wrap:
1279 ei->i_es_shrink_lblk = 0;
1280 return 0;
1253} 1281}
1254 1282
1255static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 1283static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1256 int nr_to_scan)
1257{ 1284{
1258 struct inode *inode = &ei->vfs_inode; 1285 struct inode *inode = &ei->vfs_inode;
1259 struct ext4_es_tree *tree = &ei->i_es_tree; 1286 int nr_shrunk = 0;
1260 struct rb_node *node; 1287 ext4_lblk_t start = ei->i_es_shrink_lblk;
1261 struct extent_status *es;
1262 unsigned long nr_shrunk = 0;
1263 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 1288 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1264 DEFAULT_RATELIMIT_BURST); 1289 DEFAULT_RATELIMIT_BURST);
1265 1290
1266 if (ei->i_es_lru_nr == 0) 1291 if (ei->i_es_shk_nr == 0)
1267 return 0; 1292 return 0;
1268 1293
1269 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && 1294 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1270 __ratelimit(&_rs)) 1295 __ratelimit(&_rs))
1271 ext4_warning(inode->i_sb, "forced shrink of precached extents"); 1296 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1272 1297
1273 node = rb_first(&tree->root); 1298 if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1274 while (node != NULL) { 1299 start != 0)
1275 es = rb_entry(node, struct extent_status, rb_node); 1300 es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1276 node = rb_next(&es->rb_node); 1301
1277 /* 1302 ei->i_es_tree.cache_es = NULL;
1278 * We can't reclaim delayed extent from status tree because
1279 * fiemap, bigallic, and seek_data/hole need to use it.
1280 */
1281 if (!ext4_es_is_delayed(es)) {
1282 rb_erase(&es->rb_node, &tree->root);
1283 ext4_es_free_extent(inode, es);
1284 nr_shrunk++;
1285 if (--nr_to_scan == 0)
1286 break;
1287 }
1288 }
1289 tree->cache_es = NULL;
1290 return nr_shrunk; 1303 return nr_shrunk;
1291} 1304}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index efd5f970b501..691b52613ce4 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,25 +29,28 @@
29/* 29/*
30 * These flags live in the high bits of extent_status.es_pblk 30 * These flags live in the high bits of extent_status.es_pblk
31 */ 31 */
32#define ES_SHIFT 60 32enum {
33 33 ES_WRITTEN_B,
34#define EXTENT_STATUS_WRITTEN (1 << 3) 34 ES_UNWRITTEN_B,
35#define EXTENT_STATUS_UNWRITTEN (1 << 2) 35 ES_DELAYED_B,
36#define EXTENT_STATUS_DELAYED (1 << 1) 36 ES_HOLE_B,
37#define EXTENT_STATUS_HOLE (1 << 0) 37 ES_REFERENCED_B,
38 ES_FLAGS
39};
38 40
39#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ 41#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
40 EXTENT_STATUS_UNWRITTEN | \ 42#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
41 EXTENT_STATUS_DELAYED | \
42 EXTENT_STATUS_HOLE)
43 43
44#define ES_WRITTEN (1ULL << 63) 44#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
45#define ES_UNWRITTEN (1ULL << 62) 45#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
46#define ES_DELAYED (1ULL << 61) 46#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
47#define ES_HOLE (1ULL << 60) 47#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
48#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
48 49
49#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ 50#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
50 ES_DELAYED | ES_HOLE) 51 EXTENT_STATUS_UNWRITTEN | \
52 EXTENT_STATUS_DELAYED | \
53 EXTENT_STATUS_HOLE) << ES_SHIFT)
51 54
52struct ext4_sb_info; 55struct ext4_sb_info;
53struct ext4_extent; 56struct ext4_extent;
@@ -65,14 +68,13 @@ struct ext4_es_tree {
65}; 68};
66 69
67struct ext4_es_stats { 70struct ext4_es_stats {
68 unsigned long es_stats_last_sorted;
69 unsigned long es_stats_shrunk; 71 unsigned long es_stats_shrunk;
70 unsigned long es_stats_cache_hits; 72 unsigned long es_stats_cache_hits;
71 unsigned long es_stats_cache_misses; 73 unsigned long es_stats_cache_misses;
72 u64 es_stats_scan_time; 74 u64 es_stats_scan_time;
73 u64 es_stats_max_scan_time; 75 u64 es_stats_max_scan_time;
74 struct percpu_counter es_stats_all_cnt; 76 struct percpu_counter es_stats_all_cnt;
75 struct percpu_counter es_stats_lru_cnt; 77 struct percpu_counter es_stats_shk_cnt;
76}; 78};
77 79
78extern int __init ext4_init_es(void); 80extern int __init ext4_init_es(void);
@@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
93extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 95extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
94 struct extent_status *es); 96 struct extent_status *es);
95 97
98static inline unsigned int ext4_es_status(struct extent_status *es)
99{
100 return es->es_pblk >> ES_SHIFT;
101}
102
103static inline unsigned int ext4_es_type(struct extent_status *es)
104{
105 return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
106}
107
96static inline int ext4_es_is_written(struct extent_status *es) 108static inline int ext4_es_is_written(struct extent_status *es)
97{ 109{
98 return (es->es_pblk & ES_WRITTEN) != 0; 110 return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
99} 111}
100 112
101static inline int ext4_es_is_unwritten(struct extent_status *es) 113static inline int ext4_es_is_unwritten(struct extent_status *es)
102{ 114{
103 return (es->es_pblk & ES_UNWRITTEN) != 0; 115 return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
104} 116}
105 117
106static inline int ext4_es_is_delayed(struct extent_status *es) 118static inline int ext4_es_is_delayed(struct extent_status *es)
107{ 119{
108 return (es->es_pblk & ES_DELAYED) != 0; 120 return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
109} 121}
110 122
111static inline int ext4_es_is_hole(struct extent_status *es) 123static inline int ext4_es_is_hole(struct extent_status *es)
112{ 124{
113 return (es->es_pblk & ES_HOLE) != 0; 125 return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
114} 126}
115 127
116static inline unsigned int ext4_es_status(struct extent_status *es) 128static inline void ext4_es_set_referenced(struct extent_status *es)
117{ 129{
118 return es->es_pblk >> ES_SHIFT; 130 es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
131}
132
133static inline void ext4_es_clear_referenced(struct extent_status *es)
134{
135 es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
136}
137
138static inline int ext4_es_is_referenced(struct extent_status *es)
139{
140 return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
119} 141}
120 142
121static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) 143static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
@@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
135static inline void ext4_es_store_status(struct extent_status *es, 157static inline void ext4_es_store_status(struct extent_status *es,
136 unsigned int status) 158 unsigned int status)
137{ 159{
138 es->es_pblk = (((ext4_fsblk_t) 160 es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
139 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 161 (es->es_pblk & ~ES_MASK);
140 (es->es_pblk & ~ES_MASK));
141} 162}
142 163
143static inline void ext4_es_store_pblock_status(struct extent_status *es, 164static inline void ext4_es_store_pblock_status(struct extent_status *es,
144 ext4_fsblk_t pb, 165 ext4_fsblk_t pb,
145 unsigned int status) 166 unsigned int status)
146{ 167{
147 es->es_pblk = (((ext4_fsblk_t) 168 es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
148 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 169 (pb & ~ES_MASK);
149 (pb & ~ES_MASK));
150} 170}
151 171
152extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); 172extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
153extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 173extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
154extern void ext4_es_lru_add(struct inode *inode);
155extern void ext4_es_lru_del(struct inode *inode);
156 174
157#endif /* _EXT4_EXTENTS_STATUS_H */ 175#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..513c12cf444c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
273 * we determine this extent as a data or a hole according to whether the 273 * we determine this extent as a data or a hole according to whether the
274 * page cache has data or not. 274 * page cache has data or not.
275 */ 275 */
276static int ext4_find_unwritten_pgoff(struct inode *inode, 276static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
277 int whence, 277 loff_t endoff, loff_t *offset)
278 struct ext4_map_blocks *map,
279 loff_t *offset)
280{ 278{
281 struct pagevec pvec; 279 struct pagevec pvec;
282 unsigned int blkbits;
283 pgoff_t index; 280 pgoff_t index;
284 pgoff_t end; 281 pgoff_t end;
285 loff_t endoff;
286 loff_t startoff; 282 loff_t startoff;
287 loff_t lastoff; 283 loff_t lastoff;
288 int found = 0; 284 int found = 0;
289 285
290 blkbits = inode->i_sb->s_blocksize_bits;
291 startoff = *offset; 286 startoff = *offset;
292 lastoff = startoff; 287 lastoff = startoff;
293 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; 288
294 289
295 index = startoff >> PAGE_CACHE_SHIFT; 290 index = startoff >> PAGE_CACHE_SHIFT;
296 end = endoff >> PAGE_CACHE_SHIFT; 291 end = endoff >> PAGE_CACHE_SHIFT;
@@ -408,147 +403,144 @@ out:
408static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 403static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
409{ 404{
410 struct inode *inode = file->f_mapping->host; 405 struct inode *inode = file->f_mapping->host;
411 struct ext4_map_blocks map; 406 struct fiemap_extent_info fie;
412 struct extent_status es; 407 struct fiemap_extent ext[2];
413 ext4_lblk_t start, last, end; 408 loff_t next;
414 loff_t dataoff, isize; 409 int i, ret = 0;
415 int blkbits;
416 int ret = 0;
417 410
418 mutex_lock(&inode->i_mutex); 411 mutex_lock(&inode->i_mutex);
419 412 if (offset >= inode->i_size) {
420 isize = i_size_read(inode);
421 if (offset >= isize) {
422 mutex_unlock(&inode->i_mutex); 413 mutex_unlock(&inode->i_mutex);
423 return -ENXIO; 414 return -ENXIO;
424 } 415 }
425 416 fie.fi_flags = 0;
426 blkbits = inode->i_sb->s_blocksize_bits; 417 fie.fi_extents_max = 2;
427 start = offset >> blkbits; 418 fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
428 last = start; 419 while (1) {
429 end = isize >> blkbits; 420 mm_segment_t old_fs = get_fs();
430 dataoff = offset; 421
431 422 fie.fi_extents_mapped = 0;
432 do { 423 memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
433 map.m_lblk = last; 424
434 map.m_len = end - last + 1; 425 set_fs(get_ds());
435 ret = ext4_map_blocks(NULL, inode, &map, 0); 426 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
436 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 427 set_fs(old_fs);
437 if (last != start) 428 if (ret)
438 dataoff = (loff_t)last << blkbits;
439 break; 429 break;
440 }
441 430
442 /* 431 /* No extents found, EOF */
443 * If there is a delay extent at this offset, 432 if (!fie.fi_extents_mapped) {
444 * it will be as a data. 433 ret = -ENXIO;
445 */
446 ext4_es_find_delayed_extent_range(inode, last, last, &es);
447 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
448 if (last != start)
449 dataoff = (loff_t)last << blkbits;
450 break; 434 break;
451 } 435 }
436 for (i = 0; i < fie.fi_extents_mapped; i++) {
437 next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
452 438
453 /* 439 if (offset < (loff_t)ext[i].fe_logical)
454 * If there is a unwritten extent at this offset, 440 offset = (loff_t)ext[i].fe_logical;
455 * it will be as a data or a hole according to page 441 /*
456 * cache that has data or not. 442 * If extent is not unwritten, then it contains valid
457 */ 443 * data, mapped or delayed.
458 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 444 */
459 int unwritten; 445 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
460 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 446 goto out;
461 &map, &dataoff);
462 if (unwritten)
463 break;
464 }
465 447
466 last++; 448 /*
467 dataoff = (loff_t)last << blkbits; 449 * If there is a unwritten extent at this offset,
468 } while (last <= end); 450 * it will be as a data or a hole according to page
451 * cache that has data or not.
452 */
453 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
454 next, &offset))
455 goto out;
469 456
457 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
458 ret = -ENXIO;
459 goto out;
460 }
461 offset = next;
462 }
463 }
464 if (offset > inode->i_size)
465 offset = inode->i_size;
466out:
470 mutex_unlock(&inode->i_mutex); 467 mutex_unlock(&inode->i_mutex);
468 if (ret)
469 return ret;
471 470
472 if (dataoff > isize) 471 return vfs_setpos(file, offset, maxsize);
473 return -ENXIO;
474
475 return vfs_setpos(file, dataoff, maxsize);
476} 472}
477 473
478/* 474/*
479 * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 475 * ext4_seek_hole() retrieves the offset for SEEK_HOLE
480 */ 476 */
481static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 477static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
482{ 478{
483 struct inode *inode = file->f_mapping->host; 479 struct inode *inode = file->f_mapping->host;
484 struct ext4_map_blocks map; 480 struct fiemap_extent_info fie;
485 struct extent_status es; 481 struct fiemap_extent ext[2];
486 ext4_lblk_t start, last, end; 482 loff_t next;
487 loff_t holeoff, isize; 483 int i, ret = 0;
488 int blkbits;
489 int ret = 0;
490 484
491 mutex_lock(&inode->i_mutex); 485 mutex_lock(&inode->i_mutex);
492 486 if (offset >= inode->i_size) {
493 isize = i_size_read(inode);
494 if (offset >= isize) {
495 mutex_unlock(&inode->i_mutex); 487 mutex_unlock(&inode->i_mutex);
496 return -ENXIO; 488 return -ENXIO;
497 } 489 }
498 490
499 blkbits = inode->i_sb->s_blocksize_bits; 491 fie.fi_flags = 0;
500 start = offset >> blkbits; 492 fie.fi_extents_max = 2;
501 last = start; 493 fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
502 end = isize >> blkbits; 494 while (1) {
503 holeoff = offset; 495 mm_segment_t old_fs = get_fs();
504 496
505 do { 497 fie.fi_extents_mapped = 0;
506 map.m_lblk = last; 498 memset(ext, 0, sizeof(*ext));
507 map.m_len = end - last + 1;
508 ret = ext4_map_blocks(NULL, inode, &map, 0);
509 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
510 last += ret;
511 holeoff = (loff_t)last << blkbits;
512 continue;
513 }
514 499
515 /* 500 set_fs(get_ds());
516 * If there is a delay extent at this offset, 501 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
517 * we will skip this extent. 502 set_fs(old_fs);
518 */ 503 if (ret)
519 ext4_es_find_delayed_extent_range(inode, last, last, &es); 504 break;
520 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
521 last = es.es_lblk + es.es_len;
522 holeoff = (loff_t)last << blkbits;
523 continue;
524 }
525 505
526 /* 506 /* No extents found */
527 * If there is a unwritten extent at this offset, 507 if (!fie.fi_extents_mapped)
528 * it will be as a data or a hole according to page 508 break;
529 * cache that has data or not. 509
530 */ 510 for (i = 0; i < fie.fi_extents_mapped; i++) {
531 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 511 next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
532 int unwritten; 512 /*
533 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 513 * If extent is not unwritten, then it contains valid
534 &map, &holeoff); 514 * data, mapped or delayed.
535 if (!unwritten) { 515 */
536 last += ret; 516 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
537 holeoff = (loff_t)last << blkbits; 517 if (offset < (loff_t)ext[i].fe_logical)
518 goto out;
519 offset = next;
538 continue; 520 continue;
539 } 521 }
540 } 522 /*
541 523 * If there is a unwritten extent at this offset,
542 /* find a hole */ 524 * it will be as a data or a hole according to page
543 break; 525 * cache that has data or not.
544 } while (last <= end); 526 */
527 if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
528 next, &offset))
529 goto out;
545 530
531 offset = next;
532 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
533 goto out;
534 }
535 }
536 if (offset > inode->i_size)
537 offset = inode->i_size;
538out:
546 mutex_unlock(&inode->i_mutex); 539 mutex_unlock(&inode->i_mutex);
540 if (ret)
541 return ret;
547 542
548 if (holeoff > isize) 543 return vfs_setpos(file, offset, maxsize);
549 holeoff = isize;
550
551 return vfs_setpos(file, holeoff, maxsize);
552} 544}
553 545
554/* 546/*
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3ea62695abce..4b143febf21f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
811 ret = __block_write_begin(page, 0, inline_size, 811 ret = __block_write_begin(page, 0, inline_size,
812 ext4_da_get_block_prep); 812 ext4_da_get_block_prep);
813 if (ret) { 813 if (ret) {
814 up_read(&EXT4_I(inode)->xattr_sem);
815 unlock_page(page);
816 page_cache_release(page);
814 ext4_truncate_failed_write(inode); 817 ext4_truncate_failed_write(inode);
815 goto out; 818 return ret;
816 } 819 }
817 820
818 SetPageDirty(page); 821 SetPageDirty(page);
@@ -870,6 +873,12 @@ retry_journal:
870 goto out_journal; 873 goto out_journal;
871 } 874 }
872 875
876 /*
877 * We cannot recurse into the filesystem as the transaction
878 * is already started.
879 */
880 flags |= AOP_FLAG_NOFS;
881
873 if (ret == -ENOSPC) { 882 if (ret == -ENOSPC) {
874 ret = ext4_da_convert_inline_data_to_extent(mapping, 883 ret = ext4_da_convert_inline_data_to_extent(mapping,
875 inode, 884 inode,
@@ -882,11 +891,6 @@ retry_journal:
882 goto out; 891 goto out;
883 } 892 }
884 893
885 /*
886 * We cannot recurse into the filesystem as the transaction
887 * is already started.
888 */
889 flags |= AOP_FLAG_NOFS;
890 894
891 page = grab_cache_page_write_begin(mapping, 0, flags); 895 page = grab_cache_page_write_begin(mapping, 0, flags);
892 if (!page) { 896 if (!page) {
@@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
1807 1811
1808int ext4_inline_data_fiemap(struct inode *inode, 1812int ext4_inline_data_fiemap(struct inode *inode,
1809 struct fiemap_extent_info *fieinfo, 1813 struct fiemap_extent_info *fieinfo,
1810 int *has_inline) 1814 int *has_inline, __u64 start, __u64 len)
1811{ 1815{
1812 __u64 physical = 0; 1816 __u64 physical = 0;
1813 __u64 length; 1817 __u64 inline_len;
1814 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; 1818 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
1819 FIEMAP_EXTENT_LAST;
1815 int error = 0; 1820 int error = 0;
1816 struct ext4_iloc iloc; 1821 struct ext4_iloc iloc;
1817 1822
@@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode,
1820 *has_inline = 0; 1825 *has_inline = 0;
1821 goto out; 1826 goto out;
1822 } 1827 }
1828 inline_len = min_t(size_t, ext4_get_inline_size(inode),
1829 i_size_read(inode));
1830 if (start >= inline_len)
1831 goto out;
1832 if (start + len < inline_len)
1833 inline_len = start + len;
1834 inline_len -= start;
1823 1835
1824 error = ext4_get_inode_loc(inode, &iloc); 1836 error = ext4_get_inode_loc(inode, &iloc);
1825 if (error) 1837 if (error)
@@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode,
1828 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; 1840 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1829 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; 1841 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1830 physical += offsetof(struct ext4_inode, i_block); 1842 physical += offsetof(struct ext4_inode, i_block);
1831 length = i_size_read(inode);
1832 1843
1833 if (physical) 1844 if (physical)
1834 error = fiemap_fill_next_extent(fieinfo, 0, physical, 1845 error = fiemap_fill_next_extent(fieinfo, start, physical,
1835 length, flags); 1846 inline_len, flags);
1836 brelse(iloc.bh); 1847 brelse(iloc.bh);
1837out: 1848out:
1838 up_read(&EXT4_I(inode)->xattr_sem); 1849 up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3356ab5395f4..5653fa42930b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
416 } 416 }
417 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 417 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
418 up_read((&EXT4_I(inode)->i_data_sem)); 418 up_read((&EXT4_I(inode)->i_data_sem));
419 /*
420 * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
421 * because it shouldn't be marked in es_map->m_flags.
422 */
423 map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
424 419
425 /* 420 /*
426 * We don't check m_len because extent will be collpased in status 421 * We don't check m_len because extent will be collpased in status
@@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
491 486
492 /* Lookup extent status tree firstly */ 487 /* Lookup extent status tree firstly */
493 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 488 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
494 ext4_es_lru_add(inode);
495 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 489 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
496 map->m_pblk = ext4_es_pblock(&es) + 490 map->m_pblk = ext4_es_pblock(&es) +
497 map->m_lblk - es.es_lblk; 491 map->m_lblk - es.es_lblk;
@@ -1393,7 +1387,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1393 1387
1394 /* Lookup extent status tree firstly */ 1388 /* Lookup extent status tree firstly */
1395 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1389 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1396 ext4_es_lru_add(inode);
1397 if (ext4_es_is_hole(&es)) { 1390 if (ext4_es_is_hole(&es)) {
1398 retval = 0; 1391 retval = 0;
1399 down_read(&EXT4_I(inode)->i_data_sem); 1392 down_read(&EXT4_I(inode)->i_data_sem);
@@ -1434,24 +1427,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1434 * file system block. 1427 * file system block.
1435 */ 1428 */
1436 down_read(&EXT4_I(inode)->i_data_sem); 1429 down_read(&EXT4_I(inode)->i_data_sem);
1437 if (ext4_has_inline_data(inode)) { 1430 if (ext4_has_inline_data(inode))
1438 /*
1439 * We will soon create blocks for this page, and let
1440 * us pretend as if the blocks aren't allocated yet.
1441 * In case of clusters, we have to handle the work
1442 * of mapping from cluster so that the reserved space
1443 * is calculated properly.
1444 */
1445 if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
1446 ext4_find_delalloc_cluster(inode, map->m_lblk))
1447 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
1448 retval = 0; 1431 retval = 0;
1449 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1432 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1450 retval = ext4_ext_map_blocks(NULL, inode, map, 1433 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1451 EXT4_GET_BLOCKS_NO_PUT_HOLE);
1452 else 1434 else
1453 retval = ext4_ind_map_blocks(NULL, inode, map, 1435 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1454 EXT4_GET_BLOCKS_NO_PUT_HOLE);
1455 1436
1456add_delayed: 1437add_delayed:
1457 if (retval == 0) { 1438 if (retval == 0) {
@@ -1465,7 +1446,8 @@ add_delayed:
1465 * then we don't need to reserve it again. However we still need 1446 * then we don't need to reserve it again. However we still need
1466 * to reserve metadata for every block we're going to write. 1447 * to reserve metadata for every block we're going to write.
1467 */ 1448 */
1468 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { 1449 if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
1450 !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
1469 ret = ext4_da_reserve_space(inode, iblock); 1451 ret = ext4_da_reserve_space(inode, iblock);
1470 if (ret) { 1452 if (ret) {
1471 /* not enough space to reserve */ 1453 /* not enough space to reserve */
@@ -1481,11 +1463,6 @@ add_delayed:
1481 goto out_unlock; 1463 goto out_unlock;
1482 } 1464 }
1483 1465
1484 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1485 * and it should not appear on the bh->b_state.
1486 */
1487 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1488
1489 map_bh(bh, inode->i_sb, invalid_block); 1466 map_bh(bh, inode->i_sb, invalid_block);
1490 set_buffer_new(bh); 1467 set_buffer_new(bh);
1491 set_buffer_delay(bh); 1468 set_buffer_delay(bh);
@@ -3643,7 +3620,7 @@ out_stop:
3643 * If this was a simple ftruncate() and the file will remain alive, 3620 * If this was a simple ftruncate() and the file will remain alive,
3644 * then we need to clear up the orphan record which we created above. 3621 * then we need to clear up the orphan record which we created above.
3645 * However, if this was a real unlink then we were called by 3622 * However, if this was a real unlink then we were called by
3646 * ext4_delete_inode(), and we allow that function to clean up the 3623 * ext4_evict_inode(), and we allow that function to clean up the
3647 * orphan info for us. 3624 * orphan info for us.
3648 */ 3625 */
3649 if (inode->i_nlink) 3626 if (inode->i_nlink)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfda18a15592..f58a0d106726 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
78 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 78 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
79 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); 79 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
80 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); 80 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
81 ext4_es_lru_del(inode1);
82 ext4_es_lru_del(inode2);
83 81
84 isize = i_size_read(inode1); 82 isize = i_size_read(inode1);
85 i_size_write(inode1, i_size_read(inode2)); 83 i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbfe15c2533c..8d1e60214ef0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2358 if (sbi->s_group_info) { 2358 if (sbi->s_group_info) {
2359 memcpy(new_groupinfo, sbi->s_group_info, 2359 memcpy(new_groupinfo, sbi->s_group_info,
2360 sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 2360 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2361 ext4_kvfree(sbi->s_group_info); 2361 kvfree(sbi->s_group_info);
2362 } 2362 }
2363 sbi->s_group_info = new_groupinfo; 2363 sbi->s_group_info = new_groupinfo;
2364 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 2364 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
@@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2385 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2385 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2386 metalen = sizeof(*meta_group_info) << 2386 metalen = sizeof(*meta_group_info) <<
2387 EXT4_DESC_PER_BLOCK_BITS(sb); 2387 EXT4_DESC_PER_BLOCK_BITS(sb);
2388 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2388 meta_group_info = kmalloc(metalen, GFP_NOFS);
2389 if (meta_group_info == NULL) { 2389 if (meta_group_info == NULL) {
2390 ext4_msg(sb, KERN_ERR, "can't allocate mem " 2390 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2391 "for a buddy group"); 2391 "for a buddy group");
@@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2399 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2399 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2400 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2400 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2401 2401
2402 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); 2402 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2403 if (meta_group_info[i] == NULL) { 2403 if (meta_group_info[i] == NULL) {
2404 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2404 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2405 goto exit_group_info; 2405 goto exit_group_info;
@@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2428 { 2428 {
2429 struct buffer_head *bh; 2429 struct buffer_head *bh;
2430 meta_group_info[i]->bb_bitmap = 2430 meta_group_info[i]->bb_bitmap =
2431 kmalloc(sb->s_blocksize, GFP_KERNEL); 2431 kmalloc(sb->s_blocksize, GFP_NOFS);
2432 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2432 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2433 bh = ext4_read_block_bitmap(sb, group); 2433 bh = ext4_read_block_bitmap(sb, group);
2434 BUG_ON(bh == NULL); 2434 BUG_ON(bh == NULL);
@@ -2495,7 +2495,7 @@ err_freebuddy:
2495 kfree(sbi->s_group_info[i]); 2495 kfree(sbi->s_group_info[i]);
2496 iput(sbi->s_buddy_cache); 2496 iput(sbi->s_buddy_cache);
2497err_freesgi: 2497err_freesgi:
2498 ext4_kvfree(sbi->s_group_info); 2498 kvfree(sbi->s_group_info);
2499 return -ENOMEM; 2499 return -ENOMEM;
2500} 2500}
2501 2501
@@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb)
2708 EXT4_DESC_PER_BLOCK_BITS(sb); 2708 EXT4_DESC_PER_BLOCK_BITS(sb);
2709 for (i = 0; i < num_meta_group_infos; i++) 2709 for (i = 0; i < num_meta_group_infos; i++)
2710 kfree(sbi->s_group_info[i]); 2710 kfree(sbi->s_group_info[i]);
2711 ext4_kvfree(sbi->s_group_info); 2711 kvfree(sbi->s_group_info);
2712 } 2712 }
2713 kfree(sbi->s_mb_offsets); 2713 kfree(sbi->s_mb_offsets);
2714 kfree(sbi->s_mb_maxs); 2714 kfree(sbi->s_mb_maxs);
2715 if (sbi->s_buddy_cache) 2715 iput(sbi->s_buddy_cache);
2716 iput(sbi->s_buddy_cache);
2717 if (sbi->s_mb_stats) { 2716 if (sbi->s_mb_stats) {
2718 ext4_msg(sb, KERN_INFO, 2717 ext4_msg(sb, KERN_INFO,
2719 "mballoc: %u blocks %u reqs (%u success)", 2718 "mballoc: %u blocks %u reqs (%u success)",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a432634f2e6a..3cb267aee802 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -592,7 +592,7 @@ err_out:
592 592
593 /* 593 /*
594 * set the i_blocks count to zero 594 * set the i_blocks count to zero
595 * so that the ext4_delete_inode does the 595 * so that the ext4_evict_inode() does the
596 * right job 596 * right job
597 * 597 *
598 * We don't need to take the i_lock because 598 * We don't need to take the i_lock because
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 9f2311bc9c4f..503ea15dc5db 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -273,6 +273,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
273 int replaced_count = 0; 273 int replaced_count = 0;
274 int from = data_offset_in_page << orig_inode->i_blkbits; 274 int from = data_offset_in_page << orig_inode->i_blkbits;
275 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 275 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
276 struct super_block *sb = orig_inode->i_sb;
276 277
277 /* 278 /*
278 * It needs twice the amount of ordinary journal buffers because 279 * It needs twice the amount of ordinary journal buffers because
@@ -405,10 +406,13 @@ unlock_pages:
405 page_cache_release(pagep[1]); 406 page_cache_release(pagep[1]);
406stop_journal: 407stop_journal:
407 ext4_journal_stop(handle); 408 ext4_journal_stop(handle);
409 if (*err == -ENOSPC &&
410 ext4_should_retry_alloc(sb, &retries))
411 goto again;
408 /* Buffer was busy because probably is pinned to journal transaction, 412 /* Buffer was busy because probably is pinned to journal transaction,
409 * force transaction commit may help to free it. */ 413 * force transaction commit may help to free it. */
410 if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, 414 if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
411 &retries)) 415 jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
412 goto again; 416 goto again;
413 return replaced_count; 417 return replaced_count;
414 418
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 426211882f72..2291923dae4e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2814,7 +2814,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2814 ext4_orphan_add(handle, inode); 2814 ext4_orphan_add(handle, inode);
2815 inode->i_ctime = ext4_current_time(inode); 2815 inode->i_ctime = ext4_current_time(inode);
2816 ext4_mark_inode_dirty(handle, inode); 2816 ext4_mark_inode_dirty(handle, inode);
2817 retval = 0;
2818 2817
2819end_unlink: 2818end_unlink:
2820 brelse(bh); 2819 brelse(bh);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca4588388fc3..bf76f405a5f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
856 n_group_desc[gdb_num] = gdb_bh; 856 n_group_desc[gdb_num] = gdb_bh;
857 EXT4_SB(sb)->s_group_desc = n_group_desc; 857 EXT4_SB(sb)->s_group_desc = n_group_desc;
858 EXT4_SB(sb)->s_gdb_count++; 858 EXT4_SB(sb)->s_gdb_count++;
859 ext4_kvfree(o_group_desc); 859 kvfree(o_group_desc);
860 860
861 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 861 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
862 err = ext4_handle_dirty_super(handle, sb); 862 err = ext4_handle_dirty_super(handle, sb);
@@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
866 return err; 866 return err;
867 867
868exit_inode: 868exit_inode:
869 ext4_kvfree(n_group_desc); 869 kvfree(n_group_desc);
870 brelse(iloc.bh); 870 brelse(iloc.bh);
871exit_dind: 871exit_dind:
872 brelse(dind); 872 brelse(dind);
@@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
909 n_group_desc[gdb_num] = gdb_bh; 909 n_group_desc[gdb_num] = gdb_bh;
910 EXT4_SB(sb)->s_group_desc = n_group_desc; 910 EXT4_SB(sb)->s_group_desc = n_group_desc;
911 EXT4_SB(sb)->s_gdb_count++; 911 EXT4_SB(sb)->s_gdb_count++;
912 ext4_kvfree(o_group_desc); 912 kvfree(o_group_desc);
913 BUFFER_TRACE(gdb_bh, "get_write_access"); 913 BUFFER_TRACE(gdb_bh, "get_write_access");
914 err = ext4_journal_get_write_access(handle, gdb_bh); 914 err = ext4_journal_get_write_access(handle, gdb_bh);
915 if (unlikely(err)) 915 if (unlikely(err))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 63e802b8ec68..43c92b1685cb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
176 return ret; 176 return ret;
177} 177}
178 178
179void ext4_kvfree(void *ptr)
180{
181 if (is_vmalloc_addr(ptr))
182 vfree(ptr);
183 else
184 kfree(ptr);
185
186}
187
188ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 179ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
189 struct ext4_group_desc *bg) 180 struct ext4_group_desc *bg)
190{ 181{
@@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb)
811 802
812 for (i = 0; i < sbi->s_gdb_count; i++) 803 for (i = 0; i < sbi->s_gdb_count; i++)
813 brelse(sbi->s_group_desc[i]); 804 brelse(sbi->s_group_desc[i]);
814 ext4_kvfree(sbi->s_group_desc); 805 kvfree(sbi->s_group_desc);
815 ext4_kvfree(sbi->s_flex_groups); 806 kvfree(sbi->s_flex_groups);
816 percpu_counter_destroy(&sbi->s_freeclusters_counter); 807 percpu_counter_destroy(&sbi->s_freeclusters_counter);
817 percpu_counter_destroy(&sbi->s_freeinodes_counter); 808 percpu_counter_destroy(&sbi->s_freeinodes_counter);
818 percpu_counter_destroy(&sbi->s_dirs_counter); 809 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
880 spin_lock_init(&ei->i_prealloc_lock); 871 spin_lock_init(&ei->i_prealloc_lock);
881 ext4_es_init_tree(&ei->i_es_tree); 872 ext4_es_init_tree(&ei->i_es_tree);
882 rwlock_init(&ei->i_es_lock); 873 rwlock_init(&ei->i_es_lock);
883 INIT_LIST_HEAD(&ei->i_es_lru); 874 INIT_LIST_HEAD(&ei->i_es_list);
884 ei->i_es_all_nr = 0; 875 ei->i_es_all_nr = 0;
885 ei->i_es_lru_nr = 0; 876 ei->i_es_shk_nr = 0;
886 ei->i_touch_when = 0; 877 ei->i_es_shrink_lblk = 0;
887 ei->i_reserved_data_blocks = 0; 878 ei->i_reserved_data_blocks = 0;
888 ei->i_reserved_meta_blocks = 0; 879 ei->i_reserved_meta_blocks = 0;
889 ei->i_allocated_meta_blocks = 0; 880 ei->i_allocated_meta_blocks = 0;
@@ -973,7 +964,6 @@ void ext4_clear_inode(struct inode *inode)
973 dquot_drop(inode); 964 dquot_drop(inode);
974 ext4_discard_preallocations(inode); 965 ext4_discard_preallocations(inode);
975 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 966 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
976 ext4_es_lru_del(inode);
977 if (EXT4_I(inode)->jinode) { 967 if (EXT4_I(inode)->jinode) {
978 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 968 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
979 EXT4_I(inode)->jinode); 969 EXT4_I(inode)->jinode);
@@ -1153,7 +1143,7 @@ enum {
1153 Opt_inode_readahead_blks, Opt_journal_ioprio, 1143 Opt_inode_readahead_blks, Opt_journal_ioprio,
1154 Opt_dioread_nolock, Opt_dioread_lock, 1144 Opt_dioread_nolock, Opt_dioread_lock,
1155 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1145 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1156 Opt_max_dir_size_kb, 1146 Opt_max_dir_size_kb, Opt_nojournal_checksum,
1157}; 1147};
1158 1148
1159static const match_table_t tokens = { 1149static const match_table_t tokens = {
@@ -1187,6 +1177,7 @@ static const match_table_t tokens = {
1187 {Opt_journal_dev, "journal_dev=%u"}, 1177 {Opt_journal_dev, "journal_dev=%u"},
1188 {Opt_journal_path, "journal_path=%s"}, 1178 {Opt_journal_path, "journal_path=%s"},
1189 {Opt_journal_checksum, "journal_checksum"}, 1179 {Opt_journal_checksum, "journal_checksum"},
1180 {Opt_nojournal_checksum, "nojournal_checksum"},
1190 {Opt_journal_async_commit, "journal_async_commit"}, 1181 {Opt_journal_async_commit, "journal_async_commit"},
1191 {Opt_abort, "abort"}, 1182 {Opt_abort, "abort"},
1192 {Opt_data_journal, "data=journal"}, 1183 {Opt_data_journal, "data=journal"},
@@ -1368,6 +1359,8 @@ static const struct mount_opts {
1368 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1359 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1369 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1360 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1370 MOPT_EXT4_ONLY | MOPT_CLEAR}, 1361 MOPT_EXT4_ONLY | MOPT_CLEAR},
1362 {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1363 MOPT_EXT4_ONLY | MOPT_CLEAR},
1371 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1364 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1372 MOPT_EXT4_ONLY | MOPT_SET}, 1365 MOPT_EXT4_ONLY | MOPT_SET},
1373 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1366 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1709,6 +1702,12 @@ static int parse_options(char *options, struct super_block *sb,
1709 return 0; 1702 return 0;
1710 } 1703 }
1711 } 1704 }
1705 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
1706 test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
1707 ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
1708 "in data=ordered mode");
1709 return 0;
1710 }
1712 return 1; 1711 return 1;
1713} 1712}
1714 1713
@@ -1946,7 +1945,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1946 memcpy(new_groups, sbi->s_flex_groups, 1945 memcpy(new_groups, sbi->s_flex_groups,
1947 (sbi->s_flex_groups_allocated * 1946 (sbi->s_flex_groups_allocated *
1948 sizeof(struct flex_groups))); 1947 sizeof(struct flex_groups)));
1949 ext4_kvfree(sbi->s_flex_groups); 1948 kvfree(sbi->s_flex_groups);
1950 } 1949 }
1951 sbi->s_flex_groups = new_groups; 1950 sbi->s_flex_groups = new_groups;
1952 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); 1951 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
@@ -3317,7 +3316,7 @@ int ext4_calculate_overhead(struct super_block *sb)
3317 struct ext4_super_block *es = sbi->s_es; 3316 struct ext4_super_block *es = sbi->s_es;
3318 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 3317 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3319 ext4_fsblk_t overhead = 0; 3318 ext4_fsblk_t overhead = 0;
3320 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3319 char *buf = (char *) get_zeroed_page(GFP_NOFS);
3321 3320
3322 if (!buf) 3321 if (!buf)
3323 return -ENOMEM; 3322 return -ENOMEM;
@@ -3345,8 +3344,8 @@ int ext4_calculate_overhead(struct super_block *sb)
3345 memset(buf, 0, PAGE_SIZE); 3344 memset(buf, 0, PAGE_SIZE);
3346 cond_resched(); 3345 cond_resched();
3347 } 3346 }
3348 /* Add the journal blocks as well */ 3347 /* Add the internal journal blocks as well */
3349 if (sbi->s_journal) 3348 if (sbi->s_journal && !sbi->journal_bdev)
3350 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); 3349 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3351 3350
3352 sbi->s_overhead = overhead; 3351 sbi->s_overhead = overhead;
@@ -4232,7 +4231,7 @@ failed_mount7:
4232failed_mount6: 4231failed_mount6:
4233 ext4_mb_release(sb); 4232 ext4_mb_release(sb);
4234 if (sbi->s_flex_groups) 4233 if (sbi->s_flex_groups)
4235 ext4_kvfree(sbi->s_flex_groups); 4234 kvfree(sbi->s_flex_groups);
4236 percpu_counter_destroy(&sbi->s_freeclusters_counter); 4235 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4237 percpu_counter_destroy(&sbi->s_freeinodes_counter); 4236 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4238 percpu_counter_destroy(&sbi->s_dirs_counter); 4237 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -4261,7 +4260,7 @@ failed_mount3:
4261failed_mount2: 4260failed_mount2:
4262 for (i = 0; i < db_count; i++) 4261 for (i = 0; i < db_count; i++)
4263 brelse(sbi->s_group_desc[i]); 4262 brelse(sbi->s_group_desc[i]);
4264 ext4_kvfree(sbi->s_group_desc); 4263 kvfree(sbi->s_group_desc);
4265failed_mount: 4264failed_mount:
4266 if (sbi->s_chksum_driver) 4265 if (sbi->s_chksum_driver)
4267 crypto_free_shash(sbi->s_chksum_driver); 4266 crypto_free_shash(sbi->s_chksum_driver);
@@ -4862,6 +4861,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4862 goto restore_opts; 4861 goto restore_opts;
4863 } 4862 }
4864 4863
4864 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
4865 test_opt(sb, JOURNAL_CHECKSUM)) {
4866 ext4_msg(sb, KERN_ERR, "changing journal_checksum "
4867 "during remount not supported");
4868 err = -EINVAL;
4869 goto restore_opts;
4870 }
4871
4865 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4872 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4866 if (test_opt2(sb, EXPLICIT_DELALLOC)) { 4873 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4867 ext4_msg(sb, KERN_ERR, "can't mount with " 4874 ext4_msg(sb, KERN_ERR, "can't mount with "
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1df94fabe4eb..b96bd8076b70 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1714,8 +1714,7 @@ int jbd2_journal_destroy(journal_t *journal)
1714 1714
1715 if (journal->j_proc_entry) 1715 if (journal->j_proc_entry)
1716 jbd2_stats_proc_exit(journal); 1716 jbd2_stats_proc_exit(journal);
1717 if (journal->j_inode) 1717 iput(journal->j_inode);
1718 iput(journal->j_inode);
1719 if (journal->j_revoke) 1718 if (journal->j_revoke)
1720 jbd2_journal_destroy_revoke(journal); 1719 jbd2_journal_destroy_revoke(journal);
1721 if (journal->j_chksum_driver) 1720 if (journal->j_chksum_driver)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index ec881b312700..2f389ce5023c 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -61,6 +61,11 @@ module_param(mem_size, ulong, 0400);
61MODULE_PARM_DESC(mem_size, 61MODULE_PARM_DESC(mem_size,
62 "size of reserved RAM used to store oops/panic logs"); 62 "size of reserved RAM used to store oops/panic logs");
63 63
64static unsigned int mem_type;
65module_param(mem_type, uint, 0600);
66MODULE_PARM_DESC(mem_type,
67 "set to 1 to try to use unbuffered memory (default 0)");
68
64static int dump_oops = 1; 69static int dump_oops = 1;
65module_param(dump_oops, int, 0600); 70module_param(dump_oops, int, 0600);
66MODULE_PARM_DESC(dump_oops, 71MODULE_PARM_DESC(dump_oops,
@@ -79,6 +84,7 @@ struct ramoops_context {
79 struct persistent_ram_zone *fprz; 84 struct persistent_ram_zone *fprz;
80 phys_addr_t phys_addr; 85 phys_addr_t phys_addr;
81 unsigned long size; 86 unsigned long size;
87 unsigned int memtype;
82 size_t record_size; 88 size_t record_size;
83 size_t console_size; 89 size_t console_size;
84 size_t ftrace_size; 90 size_t ftrace_size;
@@ -366,7 +372,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
366 size_t sz = cxt->record_size; 372 size_t sz = cxt->record_size;
367 373
368 cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, 374 cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
369 &cxt->ecc_info); 375 &cxt->ecc_info,
376 cxt->memtype);
370 if (IS_ERR(cxt->przs[i])) { 377 if (IS_ERR(cxt->przs[i])) {
371 err = PTR_ERR(cxt->przs[i]); 378 err = PTR_ERR(cxt->przs[i]);
372 dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", 379 dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
@@ -396,7 +403,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
396 return -ENOMEM; 403 return -ENOMEM;
397 } 404 }
398 405
399 *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); 406 *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
400 if (IS_ERR(*prz)) { 407 if (IS_ERR(*prz)) {
401 int err = PTR_ERR(*prz); 408 int err = PTR_ERR(*prz);
402 409
@@ -443,6 +450,7 @@ static int ramoops_probe(struct platform_device *pdev)
443 450
444 cxt->size = pdata->mem_size; 451 cxt->size = pdata->mem_size;
445 cxt->phys_addr = pdata->mem_address; 452 cxt->phys_addr = pdata->mem_address;
453 cxt->memtype = pdata->mem_type;
446 cxt->record_size = pdata->record_size; 454 cxt->record_size = pdata->record_size;
447 cxt->console_size = pdata->console_size; 455 cxt->console_size = pdata->console_size;
448 cxt->ftrace_size = pdata->ftrace_size; 456 cxt->ftrace_size = pdata->ftrace_size;
@@ -572,6 +580,7 @@ static void ramoops_register_dummy(void)
572 580
573 dummy_data->mem_size = mem_size; 581 dummy_data->mem_size = mem_size;
574 dummy_data->mem_address = mem_address; 582 dummy_data->mem_address = mem_address;
583 dummy_data->mem_type = 0;
575 dummy_data->record_size = record_size; 584 dummy_data->record_size = record_size;
576 dummy_data->console_size = ramoops_console_size; 585 dummy_data->console_size = ramoops_console_size;
577 dummy_data->ftrace_size = ramoops_ftrace_size; 586 dummy_data->ftrace_size = ramoops_ftrace_size;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 9d7b9a83699e..76c3f80efdfa 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -380,7 +380,8 @@ void persistent_ram_zap(struct persistent_ram_zone *prz)
380 persistent_ram_update_header_ecc(prz); 380 persistent_ram_update_header_ecc(prz);
381} 381}
382 382
383static void *persistent_ram_vmap(phys_addr_t start, size_t size) 383static void *persistent_ram_vmap(phys_addr_t start, size_t size,
384 unsigned int memtype)
384{ 385{
385 struct page **pages; 386 struct page **pages;
386 phys_addr_t page_start; 387 phys_addr_t page_start;
@@ -392,7 +393,10 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
392 page_start = start - offset_in_page(start); 393 page_start = start - offset_in_page(start);
393 page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); 394 page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
394 395
395 prot = pgprot_noncached(PAGE_KERNEL); 396 if (memtype)
397 prot = pgprot_noncached(PAGE_KERNEL);
398 else
399 prot = pgprot_writecombine(PAGE_KERNEL);
396 400
397 pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); 401 pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
398 if (!pages) { 402 if (!pages) {
@@ -411,8 +415,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
411 return vaddr; 415 return vaddr;
412} 416}
413 417
414static void *persistent_ram_iomap(phys_addr_t start, size_t size) 418static void *persistent_ram_iomap(phys_addr_t start, size_t size,
419 unsigned int memtype)
415{ 420{
421 void *va;
422
416 if (!request_mem_region(start, size, "persistent_ram")) { 423 if (!request_mem_region(start, size, "persistent_ram")) {
417 pr_err("request mem region (0x%llx@0x%llx) failed\n", 424 pr_err("request mem region (0x%llx@0x%llx) failed\n",
418 (unsigned long long)size, (unsigned long long)start); 425 (unsigned long long)size, (unsigned long long)start);
@@ -422,19 +429,24 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
422 buffer_start_add = buffer_start_add_locked; 429 buffer_start_add = buffer_start_add_locked;
423 buffer_size_add = buffer_size_add_locked; 430 buffer_size_add = buffer_size_add_locked;
424 431
425 return ioremap(start, size); 432 if (memtype)
433 va = ioremap(start, size);
434 else
435 va = ioremap_wc(start, size);
436
437 return va;
426} 438}
427 439
428static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, 440static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
429 struct persistent_ram_zone *prz) 441 struct persistent_ram_zone *prz, int memtype)
430{ 442{
431 prz->paddr = start; 443 prz->paddr = start;
432 prz->size = size; 444 prz->size = size;
433 445
434 if (pfn_valid(start >> PAGE_SHIFT)) 446 if (pfn_valid(start >> PAGE_SHIFT))
435 prz->vaddr = persistent_ram_vmap(start, size); 447 prz->vaddr = persistent_ram_vmap(start, size, memtype);
436 else 448 else
437 prz->vaddr = persistent_ram_iomap(start, size); 449 prz->vaddr = persistent_ram_iomap(start, size, memtype);
438 450
439 if (!prz->vaddr) { 451 if (!prz->vaddr) {
440 pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, 452 pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
@@ -500,7 +512,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
500} 512}
501 513
502struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, 514struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
503 u32 sig, struct persistent_ram_ecc_info *ecc_info) 515 u32 sig, struct persistent_ram_ecc_info *ecc_info,
516 unsigned int memtype)
504{ 517{
505 struct persistent_ram_zone *prz; 518 struct persistent_ram_zone *prz;
506 int ret = -ENOMEM; 519 int ret = -ENOMEM;
@@ -511,7 +524,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
511 goto err; 524 goto err;
512 } 525 }
513 526
514 ret = persistent_ram_buffer_map(start, size, prz); 527 ret = persistent_ram_buffer_map(start, size, prz, memtype);
515 if (ret) 528 if (ret)
516 goto err; 529 goto err;
517 530
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d571e173a990..9d6486d416a3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2772,7 +2772,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2772 2772
2773 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2773 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2774 reiserfs_warning(sb, "sh-462", 2774 reiserfs_warning(sb, "sh-462",
2775 "unable to initialize jornal device"); 2775 "unable to initialize journal device");
2776 goto free_and_return; 2776 goto free_and_return;
2777 } 2777 }
2778 2778
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b5b593c45270..538519ee37d9 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -262,6 +262,7 @@ static int write_begin_slow(struct address_space *mapping,
262 if (err) { 262 if (err) {
263 unlock_page(page); 263 unlock_page(page);
264 page_cache_release(page); 264 page_cache_release(page);
265 ubifs_release_budget(c, &req);
265 return err; 266 return err;
266 } 267 }
267 } 268 }
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index fb166e204441..f6ac3f29323c 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -571,7 +571,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
571 571
572 aligned_dlen = ALIGN(dlen, 8); 572 aligned_dlen = ALIGN(dlen, 8);
573 aligned_ilen = ALIGN(ilen, 8); 573 aligned_ilen = ALIGN(ilen, 8);
574
574 len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; 575 len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
576 /* Make sure to also account for extended attributes */
577 len += host_ui->data_len;
578
575 dent = kmalloc(len, GFP_NOFS); 579 dent = kmalloc(len, GFP_NOFS);
576 if (!dent) 580 if (!dent)
577 return -ENOMEM; 581 return -ENOMEM;
@@ -648,7 +652,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
648 652
649 ino_key_init(c, &ino_key, dir->i_ino); 653 ino_key_init(c, &ino_key, dir->i_ino);
650 ino_offs += aligned_ilen; 654 ino_offs += aligned_ilen;
651 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); 655 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs,
656 UBIFS_INO_NODE_SZ + host_ui->data_len);
652 if (err) 657 if (err)
653 goto out_ro; 658 goto out_ro;
654 659
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
deleted file mode 100644
index 6e247a99f5db..000000000000
--- a/fs/xfs/libxfs/xfs_ag.h
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_AG_H__
19#define __XFS_AG_H__
20
21/*
22 * Allocation group header
23 * This is divided into three structures, placed in sequential 512-byte
24 * buffers after a copy of the superblock (also in a 512-byte buffer).
25 */
26
27struct xfs_buf;
28struct xfs_mount;
29struct xfs_trans;
30
31#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
32#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
33#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
34#define XFS_AGF_VERSION 1
35#define XFS_AGI_VERSION 1
36
37#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
38#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
39
40/*
41 * Btree number 0 is bno, 1 is cnt. This value gives the size of the
42 * arrays below.
43 */
44#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
45
46/*
47 * The second word of agf_levels in the first a.g. overlaps the EFS
48 * superblock's magic number. Since the magic numbers valid for EFS
49 * are > 64k, our value cannot be confused for an EFS superblock's.
50 */
51
52typedef struct xfs_agf {
53 /*
54 * Common allocation group header information
55 */
56 __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
57 __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
58 __be32 agf_seqno; /* sequence # starting from 0 */
59 __be32 agf_length; /* size in blocks of a.g. */
60 /*
61 * Freespace information
62 */
63 __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
64 __be32 agf_spare0; /* spare field */
65 __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
66 __be32 agf_spare1; /* spare field */
67
68 __be32 agf_flfirst; /* first freelist block's index */
69 __be32 agf_fllast; /* last freelist block's index */
70 __be32 agf_flcount; /* count of blocks in freelist */
71 __be32 agf_freeblks; /* total free blocks */
72
73 __be32 agf_longest; /* longest free space */
74 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
75 uuid_t agf_uuid; /* uuid of filesystem */
76
77 /*
78 * reserve some contiguous space for future logged fields before we add
79 * the unlogged fields. This makes the range logging via flags and
80 * structure offsets much simpler.
81 */
82 __be64 agf_spare64[16];
83
84 /* unlogged fields, written during buffer writeback. */
85 __be64 agf_lsn; /* last write sequence */
86 __be32 agf_crc; /* crc of agf sector */
87 __be32 agf_spare2;
88
89 /* structure must be padded to 64 bit alignment */
90} xfs_agf_t;
91
92#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
93
94#define XFS_AGF_MAGICNUM 0x00000001
95#define XFS_AGF_VERSIONNUM 0x00000002
96#define XFS_AGF_SEQNO 0x00000004
97#define XFS_AGF_LENGTH 0x00000008
98#define XFS_AGF_ROOTS 0x00000010
99#define XFS_AGF_LEVELS 0x00000020
100#define XFS_AGF_FLFIRST 0x00000040
101#define XFS_AGF_FLLAST 0x00000080
102#define XFS_AGF_FLCOUNT 0x00000100
103#define XFS_AGF_FREEBLKS 0x00000200
104#define XFS_AGF_LONGEST 0x00000400
105#define XFS_AGF_BTREEBLKS 0x00000800
106#define XFS_AGF_UUID 0x00001000
107#define XFS_AGF_NUM_BITS 13
108#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
109
110#define XFS_AGF_FLAGS \
111 { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
112 { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
113 { XFS_AGF_SEQNO, "SEQNO" }, \
114 { XFS_AGF_LENGTH, "LENGTH" }, \
115 { XFS_AGF_ROOTS, "ROOTS" }, \
116 { XFS_AGF_LEVELS, "LEVELS" }, \
117 { XFS_AGF_FLFIRST, "FLFIRST" }, \
118 { XFS_AGF_FLLAST, "FLLAST" }, \
119 { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
120 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
121 { XFS_AGF_LONGEST, "LONGEST" }, \
122 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
123 { XFS_AGF_UUID, "UUID" }
124
125/* disk block (xfs_daddr_t) in the AG */
126#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
127#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
128#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
129
130extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
131 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
132
133/*
134 * Size of the unlinked inode hash table in the agi.
135 */
136#define XFS_AGI_UNLINKED_BUCKETS 64
137
138typedef struct xfs_agi {
139 /*
140 * Common allocation group header information
141 */
142 __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
143 __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
144 __be32 agi_seqno; /* sequence # starting from 0 */
145 __be32 agi_length; /* size in blocks of a.g. */
146 /*
147 * Inode information
148 * Inodes are mapped by interpreting the inode number, so no
149 * mapping data is needed here.
150 */
151 __be32 agi_count; /* count of allocated inodes */
152 __be32 agi_root; /* root of inode btree */
153 __be32 agi_level; /* levels in inode btree */
154 __be32 agi_freecount; /* number of free inodes */
155
156 __be32 agi_newino; /* new inode just allocated */
157 __be32 agi_dirino; /* last directory inode chunk */
158 /*
159 * Hash table of inodes which have been unlinked but are
160 * still being referenced.
161 */
162 __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
163 /*
164 * This marks the end of logging region 1 and start of logging region 2.
165 */
166 uuid_t agi_uuid; /* uuid of filesystem */
167 __be32 agi_crc; /* crc of agi sector */
168 __be32 agi_pad32;
169 __be64 agi_lsn; /* last write sequence */
170
171 __be32 agi_free_root; /* root of the free inode btree */
172 __be32 agi_free_level;/* levels in free inode btree */
173
174 /* structure must be padded to 64 bit alignment */
175} xfs_agi_t;
176
177#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
178
179#define XFS_AGI_MAGICNUM (1 << 0)
180#define XFS_AGI_VERSIONNUM (1 << 1)
181#define XFS_AGI_SEQNO (1 << 2)
182#define XFS_AGI_LENGTH (1 << 3)
183#define XFS_AGI_COUNT (1 << 4)
184#define XFS_AGI_ROOT (1 << 5)
185#define XFS_AGI_LEVEL (1 << 6)
186#define XFS_AGI_FREECOUNT (1 << 7)
187#define XFS_AGI_NEWINO (1 << 8)
188#define XFS_AGI_DIRINO (1 << 9)
189#define XFS_AGI_UNLINKED (1 << 10)
190#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
191#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
192#define XFS_AGI_FREE_ROOT (1 << 11)
193#define XFS_AGI_FREE_LEVEL (1 << 12)
194#define XFS_AGI_NUM_BITS_R2 13
195
196/* disk block (xfs_daddr_t) in the AG */
197#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
198#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
199#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
200
201extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
202 xfs_agnumber_t agno, struct xfs_buf **bpp);
203
204/*
205 * The third a.g. block contains the a.g. freelist, an array
206 * of block pointers to blocks owned by the allocation btree code.
207 */
208#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
209#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
210#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
211
212#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
213 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
214 &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
215 (__be32 *)(bp)->b_addr)
216
217/*
218 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
219 * slots in the beginning of the block for a proper header with the
220 * location information and CRC.
221 */
222#define XFS_AGFL_SIZE(mp) \
223 (((mp)->m_sb.sb_sectsize - \
224 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
225 sizeof(struct xfs_agfl) : 0)) / \
226 sizeof(xfs_agblock_t))
227
228typedef struct xfs_agfl {
229 __be32 agfl_magicnum;
230 __be32 agfl_seqno;
231 uuid_t agfl_uuid;
232 __be64 agfl_lsn;
233 __be32 agfl_crc;
234 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
235} xfs_agfl_t;
236
237#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
238
239/*
240 * tags for inode radix tree
241 */
242#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
243 in xfs_inode_ag_iterator */
244#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
245#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
246
247#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
248#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
249 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
250#define XFS_MIN_FREELIST(a,mp) \
251 (XFS_MIN_FREELIST_RAW( \
252 be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
253 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
254#define XFS_MIN_FREELIST_PAG(pag,mp) \
255 (XFS_MIN_FREELIST_RAW( \
256 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
257 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
258
259#define XFS_AGB_TO_FSB(mp,agno,agbno) \
260 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
261#define XFS_FSB_TO_AGNO(mp,fsbno) \
262 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
263#define XFS_FSB_TO_AGBNO(mp,fsbno) \
264 ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
265#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
266 ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
267 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
268#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
269
270/*
271 * For checking for bad ranges of xfs_daddr_t's, covering multiple
272 * allocation groups or a single xfs_daddr_t that's a superblock copy.
273 */
274#define XFS_AG_CHECK_DADDR(mp,d,len) \
275 ((len) == 1 ? \
276 ASSERT((d) == XFS_SB_DADDR || \
277 xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
278 ASSERT(xfs_daddr_to_agno(mp, d) == \
279 xfs_daddr_to_agno(mp, (d) + (len) - 1)))
280
281#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index eff34218f405..a6fbf4472017 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -23,7 +23,6 @@
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
29#include "xfs_btree.h" 28#include "xfs_btree.h"
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..d1b4b6a5c894 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
231 xfs_extlen_t *len, /* output: length of extent */ 231 xfs_extlen_t *len, /* output: length of extent */
232 int *stat); /* output: success/failure */ 232 int *stat); /* output: success/failure */
233 233
234int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
235 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
236
234#endif /* __XFS_ALLOC_H__ */ 237#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index e0e83e24d3ef..59d521c09a17 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -22,7 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_btree.h" 26#include "xfs_btree.h"
28#include "xfs_alloc_btree.h" 27#include "xfs_alloc_btree.h"
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 353fb425faef..0a472fbe06d4 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -42,7 +40,6 @@
42#include "xfs_quota.h" 40#include "xfs_quota.h"
43#include "xfs_trans_space.h" 41#include "xfs_trans_space.h"
44#include "xfs_trace.h" 42#include "xfs_trace.h"
45#include "xfs_dinode.h"
46 43
47/* 44/*
48 * xfs_attr.c 45 * xfs_attr.c
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b1f73dbbf3d8..5d38e8b8a913 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -24,7 +24,6 @@
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_format.h" 28#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
@@ -41,7 +40,6 @@
41#include "xfs_trace.h" 40#include "xfs_trace.h"
42#include "xfs_buf_item.h" 41#include "xfs_buf_item.h"
43#include "xfs_cksum.h" 42#include "xfs_cksum.h"
44#include "xfs_dinode.h"
45#include "xfs_dir2.h" 43#include "xfs_dir2.h"
46 44
47 45
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 7510ab8058a4..20de88d1bf86 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 79c981984dca..b5eb4743f75a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -22,9 +22,7 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -46,7 +44,6 @@
46#include "xfs_trace.h" 44#include "xfs_trace.h"
47#include "xfs_symlink.h" 45#include "xfs_symlink.h"
48#include "xfs_attr_leaf.h" 46#include "xfs_attr_leaf.h"
49#include "xfs_dinode.h"
50#include "xfs_filestream.h" 47#include "xfs_filestream.h"
51 48
52 49
@@ -5450,13 +5447,11 @@ xfs_bmse_merge(
5450 struct xfs_btree_cur *cur, 5447 struct xfs_btree_cur *cur,
5451 int *logflags) /* output */ 5448 int *logflags) /* output */
5452{ 5449{
5453 struct xfs_ifork *ifp;
5454 struct xfs_bmbt_irec got; 5450 struct xfs_bmbt_irec got;
5455 struct xfs_bmbt_irec left; 5451 struct xfs_bmbt_irec left;
5456 xfs_filblks_t blockcount; 5452 xfs_filblks_t blockcount;
5457 int error, i; 5453 int error, i;
5458 5454
5459 ifp = XFS_IFORK_PTR(ip, whichfork);
5460 xfs_bmbt_get_all(gotp, &got); 5455 xfs_bmbt_get_all(gotp, &got);
5461 xfs_bmbt_get_all(leftp, &left); 5456 xfs_bmbt_get_all(leftp, &left);
5462 blockcount = left.br_blockcount + got.br_blockcount; 5457 blockcount = left.br_blockcount + got.br_blockcount;
@@ -5489,32 +5484,25 @@ xfs_bmse_merge(
5489 error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, 5484 error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
5490 got.br_blockcount, &i); 5485 got.br_blockcount, &i);
5491 if (error) 5486 if (error)
5492 goto out_error; 5487 return error;
5493 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5488 XFS_WANT_CORRUPTED_RETURN(i == 1);
5494 5489
5495 error = xfs_btree_delete(cur, &i); 5490 error = xfs_btree_delete(cur, &i);
5496 if (error) 5491 if (error)
5497 goto out_error; 5492 return error;
5498 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5493 XFS_WANT_CORRUPTED_RETURN(i == 1);
5499 5494
5500 /* lookup and update size of the previous extent */ 5495 /* lookup and update size of the previous extent */
5501 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, 5496 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
5502 left.br_blockcount, &i); 5497 left.br_blockcount, &i);
5503 if (error) 5498 if (error)
5504 goto out_error; 5499 return error;
5505 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5500 XFS_WANT_CORRUPTED_RETURN(i == 1);
5506 5501
5507 left.br_blockcount = blockcount; 5502 left.br_blockcount = blockcount;
5508 5503
5509 error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, 5504 return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
5510 left.br_blockcount, left.br_state); 5505 left.br_blockcount, left.br_state);
5511 if (error)
5512 goto out_error;
5513
5514 return 0;
5515
5516out_error:
5517 return error;
5518} 5506}
5519 5507
5520/* 5508/*
@@ -5544,35 +5532,29 @@ xfs_bmse_shift_one(
5544 startoff = got.br_startoff - offset_shift_fsb; 5532 startoff = got.br_startoff - offset_shift_fsb;
5545 5533
5546 /* delalloc extents should be prevented by caller */ 5534 /* delalloc extents should be prevented by caller */
5547 XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock), 5535 XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock));
5548 out_error);
5549 5536
5550 /* 5537 /*
5551 * If this is the first extent in the file, make sure there's enough 5538 * Check for merge if we've got an extent to the left, otherwise make
5552 * room at the start of the file and jump right to the shift as there's 5539 * sure there's enough room at the start of the file for the shift.
5553 * no left extent to merge.
5554 */ 5540 */
5555 if (*current_ext == 0) { 5541 if (*current_ext) {
5556 if (got.br_startoff < offset_shift_fsb) 5542 /* grab the left extent and check for a large enough hole */
5557 return -EINVAL; 5543 leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
5558 goto shift_extent; 5544 xfs_bmbt_get_all(leftp, &left);
5559 }
5560 5545
5561 /* grab the left extent and check for a large enough hole */ 5546 if (startoff < left.br_startoff + left.br_blockcount)
5562 leftp = xfs_iext_get_ext(ifp, *current_ext - 1); 5547 return -EINVAL;
5563 xfs_bmbt_get_all(leftp, &left);
5564 5548
5565 if (startoff < left.br_startoff + left.br_blockcount) 5549 /* check whether to merge the extent or shift it down */
5550 if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
5551 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
5552 *current_ext, gotp, leftp, cur,
5553 logflags);
5554 }
5555 } else if (got.br_startoff < offset_shift_fsb)
5566 return -EINVAL; 5556 return -EINVAL;
5567 5557
5568 /* check whether to merge the extent or shift it down */
5569 if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
5570 goto shift_extent;
5571
5572 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
5573 gotp, leftp, cur, logflags);
5574
5575shift_extent:
5576 /* 5558 /*
5577 * Increment the extent index for the next iteration, update the start 5559 * Increment the extent index for the next iteration, update the start
5578 * offset of the in-core extent and update the btree if applicable. 5560 * offset of the in-core extent and update the btree if applicable.
@@ -5589,18 +5571,11 @@ shift_extent:
5589 got.br_blockcount, &i); 5571 got.br_blockcount, &i);
5590 if (error) 5572 if (error)
5591 return error; 5573 return error;
5592 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5574 XFS_WANT_CORRUPTED_RETURN(i == 1);
5593 5575
5594 got.br_startoff = startoff; 5576 got.br_startoff = startoff;
5595 error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, 5577 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
5596 got.br_blockcount, got.br_state); 5578 got.br_blockcount, got.br_state);
5597 if (error)
5598 return error;
5599
5600 return 0;
5601
5602out_error:
5603 return error;
5604} 5579}
5605 5580
5606/* 5581/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fba753308f31..2c44c8e50782 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_trans.h" 27#include "xfs_trans.h"
@@ -36,7 +34,6 @@
36#include "xfs_quota.h" 34#include "xfs_quota.h"
37#include "xfs_trace.h" 35#include "xfs_trace.h"
38#include "xfs_cksum.h" 36#include "xfs_cksum.h"
39#include "xfs_dinode.h"
40 37
41/* 38/*
42 * Determine the extent state. 39 * Determine the extent state.
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 8fe6a93ff473..81cad433df85 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_trans.h" 27#include "xfs_trans.h"
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index fd827530afec..9cb0115c6bd1 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -514,7 +512,6 @@ xfs_da3_root_split(
514 struct xfs_buf *bp; 512 struct xfs_buf *bp;
515 struct xfs_inode *dp; 513 struct xfs_inode *dp;
516 struct xfs_trans *tp; 514 struct xfs_trans *tp;
517 struct xfs_mount *mp;
518 struct xfs_dir2_leaf *leaf; 515 struct xfs_dir2_leaf *leaf;
519 xfs_dablk_t blkno; 516 xfs_dablk_t blkno;
520 int level; 517 int level;
@@ -534,7 +531,6 @@ xfs_da3_root_split(
534 531
535 dp = args->dp; 532 dp = args->dp;
536 tp = args->trans; 533 tp = args->trans;
537 mp = state->mp;
538 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); 534 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
539 if (error) 535 if (error)
540 return error; 536 return error;
@@ -2342,14 +2338,12 @@ xfs_da_shrink_inode(
2342 xfs_inode_t *dp; 2338 xfs_inode_t *dp;
2343 int done, error, w, count; 2339 int done, error, w, count;
2344 xfs_trans_t *tp; 2340 xfs_trans_t *tp;
2345 xfs_mount_t *mp;
2346 2341
2347 trace_xfs_da_shrink_inode(args); 2342 trace_xfs_da_shrink_inode(args);
2348 2343
2349 dp = args->dp; 2344 dp = args->dp;
2350 w = args->whichfork; 2345 w = args->whichfork;
2351 tp = args->trans; 2346 tp = args->trans;
2352 mp = dp->i_mount;
2353 count = args->geo->fsbcount; 2347 count = args->geo->fsbcount;
2354 for (;;) { 2348 for (;;) {
2355 /* 2349 /*
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 7e42fdfd2f1d..9d624a622946 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
deleted file mode 100644
index 623bbe8fd921..000000000000
--- a/fs/xfs/libxfs/xfs_dinode.h
+++ /dev/null
@@ -1,243 +0,0 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__
20
21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
23
24typedef struct xfs_timestamp {
25 __be32 t_sec; /* timestamp seconds */
26 __be32 t_nsec; /* timestamp nanoseconds */
27} xfs_timestamp_t;
28
29/*
30 * On-disk inode structure.
31 *
32 * This is just the header or "dinode core", the inode is expanded to fill a
33 * variable size the leftover area split into a data and an attribute fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
45 */
46typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */
50 __u8 di_format; /* format of di_c data */
51 __be16 di_onlink; /* old number of links to file */
52 __be32 di_uid; /* owner's user id */
53 __be32 di_gid; /* owner's group id */
54 __be32 di_nlink; /* number of links to file */
55 __be16 di_projid_lo; /* lower part of owner's project id */
56 __be16 di_projid_hi; /* higher part owner's project id */
57 __u8 di_pad[6]; /* unused, zeroed space */
58 __be16 di_flushiter; /* incremented on flush */
59 xfs_timestamp_t di_atime; /* time last accessed */
60 xfs_timestamp_t di_mtime; /* time last modified */
61 xfs_timestamp_t di_ctime; /* time created/inode modified */
62 __be64 di_size; /* number of bytes in file */
63 __be64 di_nblocks; /* # of direct & btree blocks used */
64 __be32 di_extsize; /* basic/minimum extent size for file */
65 __be32 di_nextents; /* number of extents in data fork */
66 __be16 di_anextents; /* number of extents in attribute fork*/
67 __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
68 __s8 di_aformat; /* format of attr fork's data */
69 __be32 di_dmevmask; /* DMIG event mask */
70 __be16 di_dmstate; /* DMIG state info */
71 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
72 __be32 di_gen; /* generation number */
73
74 /* di_next_unlinked is the only non-core field in the old dinode */
75 __be32 di_next_unlinked;/* agi unlinked list ptr */
76
77 /* start of the extended dinode, writable fields */
78 __le32 di_crc; /* CRC of the inode */
79 __be64 di_changecount; /* number of attribute changes */
80 __be64 di_lsn; /* flush sequence */
81 __be64 di_flags2; /* more random flags */
82 __u8 di_pad2[16]; /* more padding for future expansion */
83
84 /* fields only written to during inode creation */
85 xfs_timestamp_t di_crtime; /* time created */
86 __be64 di_ino; /* inode number */
87 uuid_t di_uuid; /* UUID of the filesystem */
88
89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t;
91
92#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
93
94#define DI_MAX_FLUSH 0xffff
95
96/*
97 * Size of the core inode on disk. Version 1 and 2 inodes have
98 * the same size, but version 3 has grown a few additional fields.
99 */
100static inline uint xfs_dinode_size(int version)
101{
102 if (version == 3)
103 return sizeof(struct xfs_dinode);
104 return offsetof(struct xfs_dinode, di_crc);
105}
106
107/*
108 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
109 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
110 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
111 */
112#define XFS_MAXLINK ((1U << 31) - 1U)
113#define XFS_MAXLINK_1 65535U
114
115/*
116 * Values for di_format
117 */
118typedef enum xfs_dinode_fmt {
119 XFS_DINODE_FMT_DEV, /* xfs_dev_t */
120 XFS_DINODE_FMT_LOCAL, /* bulk data */
121 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
122 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
123 XFS_DINODE_FMT_UUID /* uuid_t */
124} xfs_dinode_fmt_t;
125
126/*
127 * Inode minimum and maximum sizes.
128 */
129#define XFS_DINODE_MIN_LOG 8
130#define XFS_DINODE_MAX_LOG 11
131#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
132#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
133
134/*
135 * Inode size for given fs.
136 */
137#define XFS_LITINO(mp, version) \
138 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
139
140/*
141 * Inode data & attribute fork sizes, per inode.
142 */
143#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
144#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
145
146#define XFS_DFORK_DSIZE(dip,mp) \
147 (XFS_DFORK_Q(dip) ? \
148 XFS_DFORK_BOFF(dip) : \
149 XFS_LITINO(mp, (dip)->di_version))
150#define XFS_DFORK_ASIZE(dip,mp) \
151 (XFS_DFORK_Q(dip) ? \
152 XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
153 0)
154#define XFS_DFORK_SIZE(dip,mp,w) \
155 ((w) == XFS_DATA_FORK ? \
156 XFS_DFORK_DSIZE(dip, mp) : \
157 XFS_DFORK_ASIZE(dip, mp))
158
159/*
160 * Return pointers to the data or attribute forks.
161 */
162#define XFS_DFORK_DPTR(dip) \
163 ((char *)dip + xfs_dinode_size(dip->di_version))
164#define XFS_DFORK_APTR(dip) \
165 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
166#define XFS_DFORK_PTR(dip,w) \
167 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
168
169#define XFS_DFORK_FORMAT(dip,w) \
170 ((w) == XFS_DATA_FORK ? \
171 (dip)->di_format : \
172 (dip)->di_aformat)
173#define XFS_DFORK_NEXTENTS(dip,w) \
174 ((w) == XFS_DATA_FORK ? \
175 be32_to_cpu((dip)->di_nextents) : \
176 be16_to_cpu((dip)->di_anextents))
177
178#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
179
180/*
181 * For block and character special files the 32bit dev_t is stored at the
182 * beginning of the data fork.
183 */
184static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
185{
186 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
187}
188
189static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
190{
191 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
192}
193
194/*
195 * Values for di_flags
196 * There should be a one-to-one correspondence between these flags and the
197 * XFS_XFLAG_s.
198 */
199#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
200#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
201#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
202#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
203#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
204#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
205#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
206#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
207#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
208#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
209#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
210#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
211#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
212#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
213#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
214#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
215#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
216#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
217#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
218#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
219#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
220#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
221#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
222#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
223#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
224#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
225#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
226#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
227#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
228#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
229
230#ifdef CONFIG_XFS_RT
231#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
232#else
233#define XFS_IS_REALTIME_INODE(ip) (0)
234#endif
235
236#define XFS_DIFLAG_ANY \
237 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
238 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
239 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
240 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
241 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
242
243#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 7075aaf131f4..a69fb3a1e161 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -20,9 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_inum.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 23#include "xfs_mount.h"
27#include "xfs_da_format.h" 24#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 25#include "xfs_da_btree.h"
@@ -34,10 +31,25 @@
34#include "xfs_dir2_priv.h" 31#include "xfs_dir2_priv.h"
35#include "xfs_error.h" 32#include "xfs_error.h"
36#include "xfs_trace.h" 33#include "xfs_trace.h"
37#include "xfs_dinode.h"
38 34
39struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; 35struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
40 36
37/*
38 * @mode, if set, indicates that the type field needs to be set up.
39 * This uses the transformation from file mode to DT_* as defined in linux/fs.h
40 * for file type specification. This will be propagated into the directory
41 * structure if appropriate for the given operation and filesystem config.
42 */
43const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
44 [0] = XFS_DIR3_FT_UNKNOWN,
45 [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
46 [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
47 [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
48 [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
49 [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
50 [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
51 [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
52};
41 53
42/* 54/*
43 * ASCII case-insensitive (ie. A-Z) support for directories that was 55 * ASCII case-insensitive (ie. A-Z) support for directories that was
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 4dff261e6ed5..e55353651f5b 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -32,6 +32,12 @@ struct xfs_dir2_data_unused;
32extern struct xfs_name xfs_name_dotdot; 32extern struct xfs_name xfs_name_dotdot;
33 33
34/* 34/*
35 * directory filetype conversion tables.
36 */
37#define S_SHIFT 12
38extern const unsigned char xfs_mode_to_ftype[];
39
40/*
35 * directory operations vector for encode/decode routines 41 * directory operations vector for encode/decode routines
36 */ 42 */
37struct xfs_dir_ops { 43struct xfs_dir_ops {
@@ -177,4 +183,138 @@ extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
177extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; 183extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
178extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; 184extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
179 185
186/*
187 * Directory offset/block conversion functions.
188 *
189 * DB blocks here are logical directory block numbers, not filesystem blocks.
190 */
191
192/*
193 * Convert dataptr to byte in file space
194 */
195static inline xfs_dir2_off_t
196xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
197{
198 return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
199}
200
201/*
202 * Convert byte in file space to dataptr. It had better be aligned.
203 */
204static inline xfs_dir2_dataptr_t
205xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
206{
207 return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
208}
209
210/*
211 * Convert byte in space to (DB) block
212 */
213static inline xfs_dir2_db_t
214xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
215{
216 return (xfs_dir2_db_t)(by >> geo->blklog);
217}
218
219/*
220 * Convert dataptr to a block number
221 */
222static inline xfs_dir2_db_t
223xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
224{
225 return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
226}
227
228/*
229 * Convert byte in space to offset in a block
230 */
231static inline xfs_dir2_data_aoff_t
232xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
233{
234 return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
235}
236
237/*
238 * Convert dataptr to a byte offset in a block
239 */
240static inline xfs_dir2_data_aoff_t
241xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
242{
243 return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
244}
245
246/*
247 * Convert block and offset to byte in space
248 */
249static inline xfs_dir2_off_t
250xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
251 xfs_dir2_data_aoff_t o)
252{
253 return ((xfs_dir2_off_t)db << geo->blklog) + o;
254}
255
256/*
257 * Convert block (DB) to block (dablk)
258 */
259static inline xfs_dablk_t
260xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
261{
262 return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
263}
264
265/*
266 * Convert byte in space to (DA) block
267 */
268static inline xfs_dablk_t
269xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
270{
271 return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
272}
273
274/*
275 * Convert block and offset to dataptr
276 */
277static inline xfs_dir2_dataptr_t
278xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
279 xfs_dir2_data_aoff_t o)
280{
281 return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
282}
283
284/*
285 * Convert block (dablk) to block (DB)
286 */
287static inline xfs_dir2_db_t
288xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
289{
290 return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
291}
292
293/*
294 * Convert block (dablk) to byte offset in space
295 */
296static inline xfs_dir2_off_t
297xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
298{
299 return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
300}
301
302/*
303 * Directory tail pointer accessor functions. Based on block geometry.
304 */
305static inline struct xfs_dir2_block_tail *
306xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
307{
308 return ((struct xfs_dir2_block_tail *)
309 ((char *)hdr + geo->blksize)) - 1;
310}
311
312static inline struct xfs_dir2_leaf_tail *
313xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
314{
315 return (struct xfs_dir2_leaf_tail *)
316 ((char *)lp + geo->blksize -
317 sizeof(struct xfs_dir2_leaf_tail));
318}
319
180#endif /* __XFS_DIR2_H__ */ 320#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9628ceccfa02..9354e190b82e 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -36,7 +34,6 @@
36#include "xfs_error.h" 34#include "xfs_error.h"
37#include "xfs_trace.h" 35#include "xfs_trace.h"
38#include "xfs_cksum.h" 36#include "xfs_cksum.h"
39#include "xfs_dinode.h"
40 37
41/* 38/*
42 * Local function prototypes. 39 * Local function prototypes.
@@ -353,7 +350,6 @@ xfs_dir2_block_addname(
353 int low; /* low index for binary srch */ 350 int low; /* low index for binary srch */
354 int lowstale; /* low stale index */ 351 int lowstale; /* low stale index */
355 int mid=0; /* midpoint for binary srch */ 352 int mid=0; /* midpoint for binary srch */
356 xfs_mount_t *mp; /* filesystem mount point */
357 int needlog; /* need to log header */ 353 int needlog; /* need to log header */
358 int needscan; /* need to rescan freespace */ 354 int needscan; /* need to rescan freespace */
359 __be16 *tagp; /* pointer to tag value */ 355 __be16 *tagp; /* pointer to tag value */
@@ -363,7 +359,6 @@ xfs_dir2_block_addname(
363 359
364 dp = args->dp; 360 dp = args->dp;
365 tp = args->trans; 361 tp = args->trans;
366 mp = dp->i_mount;
367 362
368 /* Read the (one and only) directory block into bp. */ 363 /* Read the (one and only) directory block into bp. */
369 error = xfs_dir3_block_read(tp, dp, &bp); 364 error = xfs_dir3_block_read(tp, dp, &bp);
@@ -618,7 +613,6 @@ xfs_dir2_block_lookup(
618 xfs_inode_t *dp; /* incore inode */ 613 xfs_inode_t *dp; /* incore inode */
619 int ent; /* entry index */ 614 int ent; /* entry index */
620 int error; /* error return value */ 615 int error; /* error return value */
621 xfs_mount_t *mp; /* filesystem mount point */
622 616
623 trace_xfs_dir2_block_lookup(args); 617 trace_xfs_dir2_block_lookup(args);
624 618
@@ -629,7 +623,6 @@ xfs_dir2_block_lookup(
629 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) 623 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
630 return error; 624 return error;
631 dp = args->dp; 625 dp = args->dp;
632 mp = dp->i_mount;
633 hdr = bp->b_addr; 626 hdr = bp->b_addr;
634 xfs_dir3_data_check(dp, bp); 627 xfs_dir3_data_check(dp, bp);
635 btp = xfs_dir2_block_tail_p(args->geo, hdr); 628 btp = xfs_dir2_block_tail_p(args->geo, hdr);
@@ -770,7 +763,6 @@ xfs_dir2_block_removename(
770 xfs_inode_t *dp; /* incore inode */ 763 xfs_inode_t *dp; /* incore inode */
771 int ent; /* block leaf entry index */ 764 int ent; /* block leaf entry index */
772 int error; /* error return value */ 765 int error; /* error return value */
773 xfs_mount_t *mp; /* filesystem mount point */
774 int needlog; /* need to log block header */ 766 int needlog; /* need to log block header */
775 int needscan; /* need to fixup bestfree */ 767 int needscan; /* need to fixup bestfree */
776 xfs_dir2_sf_hdr_t sfh; /* shortform header */ 768 xfs_dir2_sf_hdr_t sfh; /* shortform header */
@@ -788,7 +780,6 @@ xfs_dir2_block_removename(
788 } 780 }
789 dp = args->dp; 781 dp = args->dp;
790 tp = args->trans; 782 tp = args->trans;
791 mp = dp->i_mount;
792 hdr = bp->b_addr; 783 hdr = bp->b_addr;
793 btp = xfs_dir2_block_tail_p(args->geo, hdr); 784 btp = xfs_dir2_block_tail_p(args->geo, hdr);
794 blp = xfs_dir2_block_leaf_p(btp); 785 blp = xfs_dir2_block_leaf_p(btp);
@@ -852,7 +843,6 @@ xfs_dir2_block_replace(
852 xfs_inode_t *dp; /* incore inode */ 843 xfs_inode_t *dp; /* incore inode */
853 int ent; /* leaf entry index */ 844 int ent; /* leaf entry index */
854 int error; /* error return value */ 845 int error; /* error return value */
855 xfs_mount_t *mp; /* filesystem mount point */
856 846
857 trace_xfs_dir2_block_replace(args); 847 trace_xfs_dir2_block_replace(args);
858 848
@@ -864,7 +854,6 @@ xfs_dir2_block_replace(
864 return error; 854 return error;
865 } 855 }
866 dp = args->dp; 856 dp = args->dp;
867 mp = dp->i_mount;
868 hdr = bp->b_addr; 857 hdr = bp->b_addr;
869 btp = xfs_dir2_block_tail_p(args->geo, hdr); 858 btp = xfs_dir2_block_tail_p(args->geo, hdr);
870 blp = xfs_dir2_block_leaf_p(btp); 859 blp = xfs_dir2_block_leaf_p(btp);
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index fdd803fecb8e..5ff31be9b1cd 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a19174eb3cb2..106119955400 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -384,7 +382,6 @@ xfs_dir2_block_to_leaf(
384 xfs_dir2_db_t ldb; /* leaf block's bno */ 382 xfs_dir2_db_t ldb; /* leaf block's bno */
385 xfs_dir2_leaf_t *leaf; /* leaf structure */ 383 xfs_dir2_leaf_t *leaf; /* leaf structure */
386 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ 384 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
387 xfs_mount_t *mp; /* filesystem mount point */
388 int needlog; /* need to log block header */ 385 int needlog; /* need to log block header */
389 int needscan; /* need to rescan bestfree */ 386 int needscan; /* need to rescan bestfree */
390 xfs_trans_t *tp; /* transaction pointer */ 387 xfs_trans_t *tp; /* transaction pointer */
@@ -395,7 +392,6 @@ xfs_dir2_block_to_leaf(
395 trace_xfs_dir2_block_to_leaf(args); 392 trace_xfs_dir2_block_to_leaf(args);
396 393
397 dp = args->dp; 394 dp = args->dp;
398 mp = dp->i_mount;
399 tp = args->trans; 395 tp = args->trans;
400 /* 396 /*
401 * Add the leaf block to the inode. 397 * Add the leaf block to the inode.
@@ -626,7 +622,6 @@ xfs_dir2_leaf_addname(
626 int lfloghigh; /* high leaf logging index */ 622 int lfloghigh; /* high leaf logging index */
627 int lowstale; /* index of prev stale leaf */ 623 int lowstale; /* index of prev stale leaf */
628 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ 624 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
629 xfs_mount_t *mp; /* filesystem mount point */
630 int needbytes; /* leaf block bytes needed */ 625 int needbytes; /* leaf block bytes needed */
631 int needlog; /* need to log data header */ 626 int needlog; /* need to log data header */
632 int needscan; /* need to rescan data free */ 627 int needscan; /* need to rescan data free */
@@ -641,7 +636,6 @@ xfs_dir2_leaf_addname(
641 636
642 dp = args->dp; 637 dp = args->dp;
643 tp = args->trans; 638 tp = args->trans;
644 mp = dp->i_mount;
645 639
646 error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); 640 error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
647 if (error) 641 if (error)
@@ -1356,11 +1350,9 @@ xfs_dir2_leaf_removename(
1356 xfs_dir2_leaf_t *leaf; /* leaf structure */ 1350 xfs_dir2_leaf_t *leaf; /* leaf structure */
1357 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1351 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1358 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ 1352 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1359 xfs_mount_t *mp; /* filesystem mount point */
1360 int needlog; /* need to log data header */ 1353 int needlog; /* need to log data header */
1361 int needscan; /* need to rescan data frees */ 1354 int needscan; /* need to rescan data frees */
1362 xfs_dir2_data_off_t oldbest; /* old value of best free */ 1355 xfs_dir2_data_off_t oldbest; /* old value of best free */
1363 xfs_trans_t *tp; /* transaction pointer */
1364 struct xfs_dir2_data_free *bf; /* bestfree table */ 1356 struct xfs_dir2_data_free *bf; /* bestfree table */
1365 struct xfs_dir2_leaf_entry *ents; 1357 struct xfs_dir2_leaf_entry *ents;
1366 struct xfs_dir3_icleaf_hdr leafhdr; 1358 struct xfs_dir3_icleaf_hdr leafhdr;
@@ -1374,8 +1366,6 @@ xfs_dir2_leaf_removename(
1374 return error; 1366 return error;
1375 } 1367 }
1376 dp = args->dp; 1368 dp = args->dp;
1377 tp = args->trans;
1378 mp = dp->i_mount;
1379 leaf = lbp->b_addr; 1369 leaf = lbp->b_addr;
1380 hdr = dbp->b_addr; 1370 hdr = dbp->b_addr;
1381 xfs_dir3_data_check(dp, dbp); 1371 xfs_dir3_data_check(dp, dbp);
@@ -1607,11 +1597,9 @@ xfs_dir2_leaf_trim_data(
1607 int error; /* error return value */ 1597 int error; /* error return value */
1608 xfs_dir2_leaf_t *leaf; /* leaf structure */ 1598 xfs_dir2_leaf_t *leaf; /* leaf structure */
1609 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ 1599 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1610 xfs_mount_t *mp; /* filesystem mount point */
1611 xfs_trans_t *tp; /* transaction pointer */ 1600 xfs_trans_t *tp; /* transaction pointer */
1612 1601
1613 dp = args->dp; 1602 dp = args->dp;
1614 mp = dp->i_mount;
1615 tp = args->trans; 1603 tp = args->trans;
1616 /* 1604 /*
1617 * Read the offending data block. We need its buffer. 1605 * Read the offending data block. We need its buffer.
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 2ae6ac2c11ae..41b80d3d3877 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -297,7 +295,6 @@ xfs_dir2_leaf_to_node(
297 int i; /* leaf freespace index */ 295 int i; /* leaf freespace index */
298 xfs_dir2_leaf_t *leaf; /* leaf structure */ 296 xfs_dir2_leaf_t *leaf; /* leaf structure */
299 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ 297 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
300 xfs_mount_t *mp; /* filesystem mount point */
301 int n; /* count of live freespc ents */ 298 int n; /* count of live freespc ents */
302 xfs_dir2_data_off_t off; /* freespace entry value */ 299 xfs_dir2_data_off_t off; /* freespace entry value */
303 __be16 *to; /* pointer to freespace entry */ 300 __be16 *to; /* pointer to freespace entry */
@@ -307,7 +304,6 @@ xfs_dir2_leaf_to_node(
307 trace_xfs_dir2_leaf_to_node(args); 304 trace_xfs_dir2_leaf_to_node(args);
308 305
309 dp = args->dp; 306 dp = args->dp;
310 mp = dp->i_mount;
311 tp = args->trans; 307 tp = args->trans;
312 /* 308 /*
313 * Add a freespace block to the directory. 309 * Add a freespace block to the directory.
@@ -387,16 +383,12 @@ xfs_dir2_leafn_add(
387 int lfloghigh; /* high leaf entry logging */ 383 int lfloghigh; /* high leaf entry logging */
388 int lfloglow; /* low leaf entry logging */ 384 int lfloglow; /* low leaf entry logging */
389 int lowstale; /* previous stale entry */ 385 int lowstale; /* previous stale entry */
390 xfs_mount_t *mp; /* filesystem mount point */
391 xfs_trans_t *tp; /* transaction pointer */
392 struct xfs_dir3_icleaf_hdr leafhdr; 386 struct xfs_dir3_icleaf_hdr leafhdr;
393 struct xfs_dir2_leaf_entry *ents; 387 struct xfs_dir2_leaf_entry *ents;
394 388
395 trace_xfs_dir2_leafn_add(args, index); 389 trace_xfs_dir2_leafn_add(args, index);
396 390
397 dp = args->dp; 391 dp = args->dp;
398 mp = dp->i_mount;
399 tp = args->trans;
400 leaf = bp->b_addr; 392 leaf = bp->b_addr;
401 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); 393 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
402 ents = dp->d_ops->leaf_ents_p(leaf); 394 ents = dp->d_ops->leaf_ents_p(leaf);
@@ -1170,7 +1162,6 @@ xfs_dir2_leafn_remove(
1170 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1162 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1171 int longest; /* longest data free entry */ 1163 int longest; /* longest data free entry */
1172 int off; /* data block entry offset */ 1164 int off; /* data block entry offset */
1173 xfs_mount_t *mp; /* filesystem mount point */
1174 int needlog; /* need to log data header */ 1165 int needlog; /* need to log data header */
1175 int needscan; /* need to rescan data frees */ 1166 int needscan; /* need to rescan data frees */
1176 xfs_trans_t *tp; /* transaction pointer */ 1167 xfs_trans_t *tp; /* transaction pointer */
@@ -1182,7 +1173,6 @@ xfs_dir2_leafn_remove(
1182 1173
1183 dp = args->dp; 1174 dp = args->dp;
1184 tp = args->trans; 1175 tp = args->trans;
1185 mp = dp->i_mount;
1186 leaf = bp->b_addr; 1176 leaf = bp->b_addr;
1187 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); 1177 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1188 ents = dp->d_ops->leaf_ents_p(leaf); 1178 ents = dp->d_ops->leaf_ents_p(leaf);
@@ -1323,7 +1313,6 @@ xfs_dir2_leafn_split(
1323 xfs_da_args_t *args; /* operation arguments */ 1313 xfs_da_args_t *args; /* operation arguments */
1324 xfs_dablk_t blkno; /* new leaf block number */ 1314 xfs_dablk_t blkno; /* new leaf block number */
1325 int error; /* error return value */ 1315 int error; /* error return value */
1326 xfs_mount_t *mp; /* filesystem mount point */
1327 struct xfs_inode *dp; 1316 struct xfs_inode *dp;
1328 1317
1329 /* 1318 /*
@@ -1331,7 +1320,6 @@ xfs_dir2_leafn_split(
1331 */ 1320 */
1332 args = state->args; 1321 args = state->args;
1333 dp = args->dp; 1322 dp = args->dp;
1334 mp = dp->i_mount;
1335 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); 1323 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
1336 error = xfs_da_grow_inode(args, &blkno); 1324 error = xfs_da_grow_inode(args, &blkno);
1337 if (error) { 1325 if (error) {
@@ -2231,12 +2219,10 @@ xfs_dir2_node_trim_free(
2231 xfs_inode_t *dp; /* incore directory inode */ 2219 xfs_inode_t *dp; /* incore directory inode */
2232 int error; /* error return code */ 2220 int error; /* error return code */
2233 xfs_dir2_free_t *free; /* freespace structure */ 2221 xfs_dir2_free_t *free; /* freespace structure */
2234 xfs_mount_t *mp; /* filesystem mount point */
2235 xfs_trans_t *tp; /* transaction pointer */ 2222 xfs_trans_t *tp; /* transaction pointer */
2236 struct xfs_dir3_icfree_hdr freehdr; 2223 struct xfs_dir3_icfree_hdr freehdr;
2237 2224
2238 dp = args->dp; 2225 dp = args->dp;
2239 mp = dp->i_mount;
2240 tp = args->trans; 2226 tp = args->trans;
2241 /* 2227 /*
2242 * Read the freespace block. 2228 * Read the freespace block.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..ef9f6ead96a4 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -20,140 +20,6 @@
20 20
21struct dir_context; 21struct dir_context;
22 22
23/*
24 * Directory offset/block conversion functions.
25 *
26 * DB blocks here are logical directory block numbers, not filesystem blocks.
27 */
28
29/*
30 * Convert dataptr to byte in file space
31 */
32static inline xfs_dir2_off_t
33xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
34{
35 return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
36}
37
38/*
39 * Convert byte in file space to dataptr. It had better be aligned.
40 */
41static inline xfs_dir2_dataptr_t
42xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
43{
44 return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
45}
46
47/*
48 * Convert byte in space to (DB) block
49 */
50static inline xfs_dir2_db_t
51xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
52{
53 return (xfs_dir2_db_t)(by >> geo->blklog);
54}
55
56/*
57 * Convert dataptr to a block number
58 */
59static inline xfs_dir2_db_t
60xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
61{
62 return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
63}
64
65/*
66 * Convert byte in space to offset in a block
67 */
68static inline xfs_dir2_data_aoff_t
69xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
70{
71 return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
72}
73
74/*
75 * Convert dataptr to a byte offset in a block
76 */
77static inline xfs_dir2_data_aoff_t
78xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
79{
80 return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
81}
82
83/*
84 * Convert block and offset to byte in space
85 */
86static inline xfs_dir2_off_t
87xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
88 xfs_dir2_data_aoff_t o)
89{
90 return ((xfs_dir2_off_t)db << geo->blklog) + o;
91}
92
93/*
94 * Convert block (DB) to block (dablk)
95 */
96static inline xfs_dablk_t
97xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
98{
99 return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
100}
101
102/*
103 * Convert byte in space to (DA) block
104 */
105static inline xfs_dablk_t
106xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
107{
108 return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
109}
110
111/*
112 * Convert block and offset to dataptr
113 */
114static inline xfs_dir2_dataptr_t
115xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
116 xfs_dir2_data_aoff_t o)
117{
118 return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
119}
120
121/*
122 * Convert block (dablk) to block (DB)
123 */
124static inline xfs_dir2_db_t
125xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
126{
127 return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
128}
129
130/*
131 * Convert block (dablk) to byte offset in space
132 */
133static inline xfs_dir2_off_t
134xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
135{
136 return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
137}
138
139/*
140 * Directory tail pointer accessor functions. Based on block geometry.
141 */
142static inline struct xfs_dir2_block_tail *
143xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
144{
145 return ((struct xfs_dir2_block_tail *)
146 ((char *)hdr + geo->blksize)) - 1;
147}
148
149static inline struct xfs_dir2_leaf_tail *
150xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
151{
152 return (struct xfs_dir2_leaf_tail *)
153 ((char *)lp + geo->blksize -
154 sizeof(struct xfs_dir2_leaf_tail));
155}
156
157/* xfs_dir2.c */ 23/* xfs_dir2.c */
158extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); 24extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
159extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, 25extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -161,12 +27,6 @@ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
161extern int xfs_dir_cilookup_result(struct xfs_da_args *args, 27extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
162 const unsigned char *name, int len); 28 const unsigned char *name, int len);
163 29
164#define S_SHIFT 12
165extern const unsigned char xfs_mode_to_ftype[];
166
167extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
168 __uint8_t filetype);
169
170 30
171/* xfs_dir2_block.c */ 31/* xfs_dir2_block.c */
172extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, 32extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 5079e051ef08..974d62e677f4 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_da_format.h" 24#include "xfs_da_format.h"
27#include "xfs_da_btree.h" 25#include "xfs_da_btree.h"
@@ -32,7 +30,6 @@
32#include "xfs_dir2.h" 30#include "xfs_dir2.h"
33#include "xfs_dir2_priv.h" 31#include "xfs_dir2_priv.h"
34#include "xfs_trace.h" 32#include "xfs_trace.h"
35#include "xfs_dinode.h"
36 33
37/* 34/*
38 * Prototypes for internal functions. 35 * Prototypes for internal functions.
@@ -455,13 +452,11 @@ xfs_dir2_sf_addname_hard(
455 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ 452 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
456 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ 453 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
457 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ 454 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
458 struct xfs_mount *mp;
459 455
460 /* 456 /*
461 * Copy the old directory to the stack buffer. 457 * Copy the old directory to the stack buffer.
462 */ 458 */
463 dp = args->dp; 459 dp = args->dp;
464 mp = dp->i_mount;
465 460
466 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 461 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
467 old_isize = (int)dp->i_d.di_size; 462 old_isize = (int)dp->i_d.di_size;
@@ -542,7 +537,6 @@ xfs_dir2_sf_addname_pick(
542 xfs_inode_t *dp; /* incore directory inode */ 537 xfs_inode_t *dp; /* incore directory inode */
543 int holefit; /* found hole it will fit in */ 538 int holefit; /* found hole it will fit in */
544 int i; /* entry number */ 539 int i; /* entry number */
545 xfs_mount_t *mp; /* filesystem mount point */
546 xfs_dir2_data_aoff_t offset; /* data block offset */ 540 xfs_dir2_data_aoff_t offset; /* data block offset */
547 xfs_dir2_sf_entry_t *sfep; /* shortform entry */ 541 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
548 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 542 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
@@ -550,7 +544,6 @@ xfs_dir2_sf_addname_pick(
550 int used; /* data bytes used */ 544 int used; /* data bytes used */
551 545
552 dp = args->dp; 546 dp = args->dp;
553 mp = dp->i_mount;
554 547
555 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 548 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
556 size = dp->d_ops->data_entsize(args->namelen); 549 size = dp->d_ops->data_entsize(args->namelen);
@@ -616,10 +609,8 @@ xfs_dir2_sf_check(
616 int offset; /* data offset */ 609 int offset; /* data offset */
617 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ 610 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
618 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 611 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
619 struct xfs_mount *mp;
620 612
621 dp = args->dp; 613 dp = args->dp;
622 mp = dp->i_mount;
623 614
624 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 615 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
625 offset = dp->d_ops->data_first_offset; 616 offset = dp->d_ops->data_first_offset;
@@ -1016,12 +1007,10 @@ xfs_dir2_sf_toino4(
1016 int oldsize; /* old inode size */ 1007 int oldsize; /* old inode size */
1017 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1008 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1018 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ 1009 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1019 struct xfs_mount *mp;
1020 1010
1021 trace_xfs_dir2_sf_toino4(args); 1011 trace_xfs_dir2_sf_toino4(args);
1022 1012
1023 dp = args->dp; 1013 dp = args->dp;
1024 mp = dp->i_mount;
1025 1014
1026 /* 1015 /*
1027 * Copy the old directory to the buffer. 1016 * Copy the old directory to the buffer.
@@ -1094,12 +1083,10 @@ xfs_dir2_sf_toino8(
1094 int oldsize; /* old inode size */ 1083 int oldsize; /* old inode size */
1095 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1084 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1096 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ 1085 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1097 struct xfs_mount *mp;
1098 1086
1099 trace_xfs_dir2_sf_toino8(args); 1087 trace_xfs_dir2_sf_toino8(args);
1100 1088
1101 dp = args->dp; 1089 dp = args->dp;
1102 mp = dp->i_mount;
1103 1090
1104 /* 1091 /*
1105 * Copy the old directory to the buffer. 1092 * Copy the old directory to the buffer.
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index bb969337efc8..6fbf2d853a54 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 7e42bba9a420..fbd6da263571 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -34,6 +34,1077 @@ struct xfs_buf;
34struct xfs_ifork; 34struct xfs_ifork;
35 35
36/* 36/*
37 * Super block
38 * Fits into a sector-sized buffer at address 0 of each allocation group.
39 * Only the first of these is ever updated except during growfs.
40 */
41#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
42#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
43#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
44#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
45#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
46#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
47#define XFS_SB_VERSION_NUMBITS 0x000f
48#define XFS_SB_VERSION_ALLFBITS 0xfff0
49#define XFS_SB_VERSION_ATTRBIT 0x0010
50#define XFS_SB_VERSION_NLINKBIT 0x0020
51#define XFS_SB_VERSION_QUOTABIT 0x0040
52#define XFS_SB_VERSION_ALIGNBIT 0x0080
53#define XFS_SB_VERSION_DALIGNBIT 0x0100
54#define XFS_SB_VERSION_SHAREDBIT 0x0200
55#define XFS_SB_VERSION_LOGV2BIT 0x0400
56#define XFS_SB_VERSION_SECTORBIT 0x0800
57#define XFS_SB_VERSION_EXTFLGBIT 0x1000
58#define XFS_SB_VERSION_DIRV2BIT 0x2000
59#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
60#define XFS_SB_VERSION_MOREBITSBIT 0x8000
61
62/*
63 * Supported feature bit list is just all bits in the versionnum field because
64 * we've used them all up and understand them all. Except, of course, for the
65 * shared superblock bit, which nobody knows what it does and so is unsupported.
66 */
67#define XFS_SB_VERSION_OKBITS \
68 ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
69 ~XFS_SB_VERSION_SHAREDBIT)
70
71/*
72 * There are two words to hold XFS "feature" bits: the original
73 * word, sb_versionnum, and sb_features2. Whenever a bit is set in
74 * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
75 *
76 * These defines represent bits in sb_features2.
77 */
78#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
85#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
86
87#define XFS_SB_VERSION2_OKBITS \
88 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
89 XFS_SB_VERSION2_ATTR2BIT | \
90 XFS_SB_VERSION2_PROJID32BIT | \
91 XFS_SB_VERSION2_FTYPE)
92
93/*
94 * Superblock - in core version. Must match the ondisk version below.
95 * Must be padded to 64 bit alignment.
96 */
97typedef struct xfs_sb {
98 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
99 __uint32_t sb_blocksize; /* logical block size, bytes */
100 xfs_rfsblock_t sb_dblocks; /* number of data blocks */
101 xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
102 xfs_rtblock_t sb_rextents; /* number of realtime extents */
103 uuid_t sb_uuid; /* file system unique id */
104 xfs_fsblock_t sb_logstart; /* starting block of log if internal */
105 xfs_ino_t sb_rootino; /* root inode number */
106 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
107 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
108 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
109 xfs_agblock_t sb_agblocks; /* size of an allocation group */
110 xfs_agnumber_t sb_agcount; /* number of allocation groups */
111 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
112 xfs_extlen_t sb_logblocks; /* number of log blocks */
113 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
114 __uint16_t sb_sectsize; /* volume sector size, bytes */
115 __uint16_t sb_inodesize; /* inode size, bytes */
116 __uint16_t sb_inopblock; /* inodes per block */
117 char sb_fname[12]; /* file system name */
118 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
119 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
120 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
121 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
122 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
123 __uint8_t sb_rextslog; /* log2 of sb_rextents */
124 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
125 __uint8_t sb_imax_pct; /* max % of fs for inode space */
126 /* statistics */
127 /*
128 * These fields must remain contiguous. If you really
129 * want to change their layout, make sure you fix the
130 * code in xfs_trans_apply_sb_deltas().
131 */
132 __uint64_t sb_icount; /* allocated inodes */
133 __uint64_t sb_ifree; /* free inodes */
134 __uint64_t sb_fdblocks; /* free data blocks */
135 __uint64_t sb_frextents; /* free realtime extents */
136 /*
137 * End contiguous fields.
138 */
139 xfs_ino_t sb_uquotino; /* user quota inode */
140 xfs_ino_t sb_gquotino; /* group quota inode */
141 __uint16_t sb_qflags; /* quota flags */
142 __uint8_t sb_flags; /* misc. flags */
143 __uint8_t sb_shared_vn; /* shared version number */
144 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
145 __uint32_t sb_unit; /* stripe or raid unit */
146 __uint32_t sb_width; /* stripe or raid width */
147 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
148 __uint8_t sb_logsectlog; /* log2 of the log sector size */
149 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
150 __uint32_t sb_logsunit; /* stripe unit size for the log */
151 __uint32_t sb_features2; /* additional feature bits */
152
153 /*
154 * bad features2 field as a result of failing to pad the sb
155 * structure to 64 bits. Some machines will be using this field
156 * for features2 bits. Easiest just to mark it bad and not use
157 * it for anything else.
158 */
159 __uint32_t sb_bad_features2;
160
161 /* version 5 superblock fields start here */
162
163 /* feature masks */
164 __uint32_t sb_features_compat;
165 __uint32_t sb_features_ro_compat;
166 __uint32_t sb_features_incompat;
167 __uint32_t sb_features_log_incompat;
168
169 __uint32_t sb_crc; /* superblock crc */
170 __uint32_t sb_pad;
171
172 xfs_ino_t sb_pquotino; /* project quota inode */
173 xfs_lsn_t sb_lsn; /* last write sequence */
174
175 /* must be padded to 64 bit alignment */
176} xfs_sb_t;
177
178#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
179
180/*
181 * Superblock - on disk version. Must match the in core version above.
182 * Must be padded to 64 bit alignment.
183 */
184typedef struct xfs_dsb {
185 __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
186 __be32 sb_blocksize; /* logical block size, bytes */
187 __be64 sb_dblocks; /* number of data blocks */
188 __be64 sb_rblocks; /* number of realtime blocks */
189 __be64 sb_rextents; /* number of realtime extents */
190 uuid_t sb_uuid; /* file system unique id */
191 __be64 sb_logstart; /* starting block of log if internal */
192 __be64 sb_rootino; /* root inode number */
193 __be64 sb_rbmino; /* bitmap inode for realtime extents */
194 __be64 sb_rsumino; /* summary inode for rt bitmap */
195 __be32 sb_rextsize; /* realtime extent size, blocks */
196 __be32 sb_agblocks; /* size of an allocation group */
197 __be32 sb_agcount; /* number of allocation groups */
198 __be32 sb_rbmblocks; /* number of rt bitmap blocks */
199 __be32 sb_logblocks; /* number of log blocks */
200 __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
201 __be16 sb_sectsize; /* volume sector size, bytes */
202 __be16 sb_inodesize; /* inode size, bytes */
203 __be16 sb_inopblock; /* inodes per block */
204 char sb_fname[12]; /* file system name */
205 __u8 sb_blocklog; /* log2 of sb_blocksize */
206 __u8 sb_sectlog; /* log2 of sb_sectsize */
207 __u8 sb_inodelog; /* log2 of sb_inodesize */
208 __u8 sb_inopblog; /* log2 of sb_inopblock */
209 __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
210 __u8 sb_rextslog; /* log2 of sb_rextents */
211 __u8 sb_inprogress; /* mkfs is in progress, don't mount */
212 __u8 sb_imax_pct; /* max % of fs for inode space */
213 /* statistics */
214 /*
215 * These fields must remain contiguous. If you really
216 * want to change their layout, make sure you fix the
217 * code in xfs_trans_apply_sb_deltas().
218 */
219 __be64 sb_icount; /* allocated inodes */
220 __be64 sb_ifree; /* free inodes */
221 __be64 sb_fdblocks; /* free data blocks */
222 __be64 sb_frextents; /* free realtime extents */
223 /*
224 * End contiguous fields.
225 */
226 __be64 sb_uquotino; /* user quota inode */
227 __be64 sb_gquotino; /* group quota inode */
228 __be16 sb_qflags; /* quota flags */
229 __u8 sb_flags; /* misc. flags */
230 __u8 sb_shared_vn; /* shared version number */
231 __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
232 __be32 sb_unit; /* stripe or raid unit */
233 __be32 sb_width; /* stripe or raid width */
234 __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
235 __u8 sb_logsectlog; /* log2 of the log sector size */
236 __be16 sb_logsectsize; /* sector size for the log, bytes */
237 __be32 sb_logsunit; /* stripe unit size for the log */
238 __be32 sb_features2; /* additional feature bits */
239 /*
240 * bad features2 field as a result of failing to pad the sb
241 * structure to 64 bits. Some machines will be using this field
242 * for features2 bits. Easiest just to mark it bad and not use
243 * it for anything else.
244 */
245 __be32 sb_bad_features2;
246
247 /* version 5 superblock fields start here */
248
249 /* feature masks */
250 __be32 sb_features_compat;
251 __be32 sb_features_ro_compat;
252 __be32 sb_features_incompat;
253 __be32 sb_features_log_incompat;
254
255 __le32 sb_crc; /* superblock crc */
256 __be32 sb_pad;
257
258 __be64 sb_pquotino; /* project quota inode */
259 __be64 sb_lsn; /* last write sequence */
260
261 /* must be padded to 64 bit alignment */
262} xfs_dsb_t;
263
264/*
265 * Sequence number values for the fields.
266 */
267typedef enum {
268 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
269 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
270 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
271 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
272 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
273 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
274 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
275 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
276 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
277 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
278 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
279 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
280 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
281 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
282 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
283 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
284 XFS_SBS_FIELDCOUNT
285} xfs_sb_field_t;
286
287/*
288 * Mask values, defined based on the xfs_sb_field_t values.
289 * Only define the ones we're using.
290 */
291#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
292#define XFS_SB_UUID XFS_SB_MVAL(UUID)
293#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
294#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
295#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
296#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
297#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
298#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
299#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
300#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
301#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
302#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
303#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
304#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
305#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
306#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
307#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
308#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
309#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
310#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
311#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
312#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
313#define XFS_SB_CRC XFS_SB_MVAL(CRC)
314#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
315#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
316#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
317#define XFS_SB_MOD_BITS \
318 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
319 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
320 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
321 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
322 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
323 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
324 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
325
326
327/*
328 * Misc. Flags - warning - these will be cleared by xfs_repair unless
329 * a feature bit is set when the flag is used.
330 */
331#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
332#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
333
334/*
335 * define max. shared version we can interoperate with
336 */
337#define XFS_SB_MAX_SHARED_VN 0
338
339#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
340
341/*
342 * The first XFS version we support is a v4 superblock with V2 directories.
343 */
344static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
345{
346 if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
347 return false;
348
349 /* check for unknown features in the fs */
350 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
351 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
352 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
353 return false;
354
355 return true;
356}
357
358static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
359{
360 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
361 return true;
362 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
363 return xfs_sb_good_v4_features(sbp);
364 return false;
365}
366
367/*
368 * Detect a mismatched features2 field. Older kernels read/wrote
369 * this into the wrong slot, so to be safe we keep them in sync.
370 */
371static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
372{
373 return sbp->sb_bad_features2 != sbp->sb_features2;
374}
375
376static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
377{
378 return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
379}
380
381static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
382{
383 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
384}
385
386static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
387{
388 return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
389}
390
391static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
392{
393 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
394}
395
396static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
397{
398 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
399 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
400}
401
402static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
403{
404 return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
405}
406
407static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
408{
409 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
410 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
411}
412
413static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
414{
415 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
416 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
417}
418
419static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
420{
421 return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
422}
423
424static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
425{
426 return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
427}
428
429static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
430{
431 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
432 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
433}
434
435/*
436 * sb_features2 bit version macros.
437 */
438static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
439{
440 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
441 (xfs_sb_version_hasmorebits(sbp) &&
442 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
443}
444
445static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
446{
447 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
448 (xfs_sb_version_hasmorebits(sbp) &&
449 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
450}
451
452static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
453{
454 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
455 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
456 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
457}
458
459static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
460{
461 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
462 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
463 if (!sbp->sb_features2)
464 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
465}
466
467static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
468{
469 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
470 (xfs_sb_version_hasmorebits(sbp) &&
471 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
472}
473
474static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
475{
476 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
477 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
478 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
479}
480
481/*
482 * Extended v5 superblock feature masks. These are to be used for new v5
483 * superblock features only.
484 *
485 * Compat features are new features that old kernels will not notice or affect
486 * and so can mount read-write without issues.
487 *
488 * RO-Compat (read only) are features that old kernels can read but will break
489 * if they write. Hence only read-only mounts of such filesystems are allowed on
490 * kernels that don't support the feature bit.
491 *
492 * InCompat features are features which old kernels will not understand and so
493 * must not mount.
494 *
495 * Log-InCompat features are for changes to log formats or new transactions that
496 * can't be replayed on older kernels. The fields are set when the filesystem is
497 * mounted, and a clean unmount clears the fields.
498 */
499#define XFS_SB_FEAT_COMPAT_ALL 0
500#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
501static inline bool
502xfs_sb_has_compat_feature(
503 struct xfs_sb *sbp,
504 __uint32_t feature)
505{
506 return (sbp->sb_features_compat & feature) != 0;
507}
508
509#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
510#define XFS_SB_FEAT_RO_COMPAT_ALL \
511 (XFS_SB_FEAT_RO_COMPAT_FINOBT)
512#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
513static inline bool
514xfs_sb_has_ro_compat_feature(
515 struct xfs_sb *sbp,
516 __uint32_t feature)
517{
518 return (sbp->sb_features_ro_compat & feature) != 0;
519}
520
521#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
522#define XFS_SB_FEAT_INCOMPAT_ALL \
523 (XFS_SB_FEAT_INCOMPAT_FTYPE)
524
525#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
526static inline bool
527xfs_sb_has_incompat_feature(
528 struct xfs_sb *sbp,
529 __uint32_t feature)
530{
531 return (sbp->sb_features_incompat & feature) != 0;
532}
533
534#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
535#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
536static inline bool
537xfs_sb_has_incompat_log_feature(
538 struct xfs_sb *sbp,
539 __uint32_t feature)
540{
541 return (sbp->sb_features_log_incompat & feature) != 0;
542}
543
544/*
545 * V5 superblock specific feature checks
546 */
547static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
548{
549 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
550}
551
552static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
553{
554 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
555}
556
557static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
558{
559 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
560 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
561 (xfs_sb_version_hasmorebits(sbp) &&
562 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
563}
564
565static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
566{
567 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
568 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
569}
570
571/*
572 * end of superblock version macros
573 */
574
575static inline bool
576xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
577{
578 return (ino == sbp->sb_uquotino ||
579 ino == sbp->sb_gquotino ||
580 ino == sbp->sb_pquotino);
581}
582
583#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
584#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
585#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
586
587#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
588#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
589 xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
590#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
591 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
592
593/*
594 * File system sector to basic block conversions.
595 */
596#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
597
598/*
599 * File system block to basic block conversions.
600 */
601#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
602#define XFS_BB_TO_FSB(mp,bb) \
603 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
604#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
605
606/*
607 * File system block to byte conversions.
608 */
609#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
610#define XFS_B_TO_FSB(mp,b) \
611 ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
612#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
613#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
614
615/*
616 * Allocation group header
617 *
618 * This is divided into three structures, placed in sequential 512-byte
619 * buffers after a copy of the superblock (also in a 512-byte buffer).
620 */
621#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
622#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
623#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
624#define XFS_AGF_VERSION 1
625#define XFS_AGI_VERSION 1
626
627#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
628#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
629
630/*
631 * Btree number 0 is bno, 1 is cnt. This value gives the size of the
632 * arrays below.
633 */
634#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
635
636/*
637 * The second word of agf_levels in the first a.g. overlaps the EFS
638 * superblock's magic number. Since the magic numbers valid for EFS
639 * are > 64k, our value cannot be confused for an EFS superblock's.
640 */
641
642typedef struct xfs_agf {
643 /*
644 * Common allocation group header information
645 */
646 __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
647 __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
648 __be32 agf_seqno; /* sequence # starting from 0 */
649 __be32 agf_length; /* size in blocks of a.g. */
650 /*
651 * Freespace information
652 */
653 __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
654 __be32 agf_spare0; /* spare field */
655 __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
656 __be32 agf_spare1; /* spare field */
657
658 __be32 agf_flfirst; /* first freelist block's index */
659 __be32 agf_fllast; /* last freelist block's index */
660 __be32 agf_flcount; /* count of blocks in freelist */
661 __be32 agf_freeblks; /* total free blocks */
662
663 __be32 agf_longest; /* longest free space */
664 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
665 uuid_t agf_uuid; /* uuid of filesystem */
666
667 /*
668 * reserve some contiguous space for future logged fields before we add
669 * the unlogged fields. This makes the range logging via flags and
670 * structure offsets much simpler.
671 */
672 __be64 agf_spare64[16];
673
674 /* unlogged fields, written during buffer writeback. */
675 __be64 agf_lsn; /* last write sequence */
676 __be32 agf_crc; /* crc of agf sector */
677 __be32 agf_spare2;
678
679 /* structure must be padded to 64 bit alignment */
680} xfs_agf_t;
681
682#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
683
684#define XFS_AGF_MAGICNUM 0x00000001
685#define XFS_AGF_VERSIONNUM 0x00000002
686#define XFS_AGF_SEQNO 0x00000004
687#define XFS_AGF_LENGTH 0x00000008
688#define XFS_AGF_ROOTS 0x00000010
689#define XFS_AGF_LEVELS 0x00000020
690#define XFS_AGF_FLFIRST 0x00000040
691#define XFS_AGF_FLLAST 0x00000080
692#define XFS_AGF_FLCOUNT 0x00000100
693#define XFS_AGF_FREEBLKS 0x00000200
694#define XFS_AGF_LONGEST 0x00000400
695#define XFS_AGF_BTREEBLKS 0x00000800
696#define XFS_AGF_UUID 0x00001000
697#define XFS_AGF_NUM_BITS 13
698#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
699
700#define XFS_AGF_FLAGS \
701 { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
702 { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
703 { XFS_AGF_SEQNO, "SEQNO" }, \
704 { XFS_AGF_LENGTH, "LENGTH" }, \
705 { XFS_AGF_ROOTS, "ROOTS" }, \
706 { XFS_AGF_LEVELS, "LEVELS" }, \
707 { XFS_AGF_FLFIRST, "FLFIRST" }, \
708 { XFS_AGF_FLLAST, "FLLAST" }, \
709 { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
710 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
711 { XFS_AGF_LONGEST, "LONGEST" }, \
712 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
713 { XFS_AGF_UUID, "UUID" }
714
715/* disk block (xfs_daddr_t) in the AG */
716#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
717#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
718#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
719
720/*
721 * Size of the unlinked inode hash table in the agi.
722 */
723#define XFS_AGI_UNLINKED_BUCKETS 64
724
725typedef struct xfs_agi {
726 /*
727 * Common allocation group header information
728 */
729 __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
730 __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
731 __be32 agi_seqno; /* sequence # starting from 0 */
732 __be32 agi_length; /* size in blocks of a.g. */
733 /*
734 * Inode information
735 * Inodes are mapped by interpreting the inode number, so no
736 * mapping data is needed here.
737 */
738 __be32 agi_count; /* count of allocated inodes */
739 __be32 agi_root; /* root of inode btree */
740 __be32 agi_level; /* levels in inode btree */
741 __be32 agi_freecount; /* number of free inodes */
742
743 __be32 agi_newino; /* new inode just allocated */
744 __be32 agi_dirino; /* last directory inode chunk */
745 /*
746 * Hash table of inodes which have been unlinked but are
747 * still being referenced.
748 */
749 __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
750 /*
751 * This marks the end of logging region 1 and start of logging region 2.
752 */
753 uuid_t agi_uuid; /* uuid of filesystem */
754 __be32 agi_crc; /* crc of agi sector */
755 __be32 agi_pad32;
756 __be64 agi_lsn; /* last write sequence */
757
758 __be32 agi_free_root; /* root of the free inode btree */
759 __be32 agi_free_level;/* levels in free inode btree */
760
761 /* structure must be padded to 64 bit alignment */
762} xfs_agi_t;
763
764#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
765
766#define XFS_AGI_MAGICNUM (1 << 0)
767#define XFS_AGI_VERSIONNUM (1 << 1)
768#define XFS_AGI_SEQNO (1 << 2)
769#define XFS_AGI_LENGTH (1 << 3)
770#define XFS_AGI_COUNT (1 << 4)
771#define XFS_AGI_ROOT (1 << 5)
772#define XFS_AGI_LEVEL (1 << 6)
773#define XFS_AGI_FREECOUNT (1 << 7)
774#define XFS_AGI_NEWINO (1 << 8)
775#define XFS_AGI_DIRINO (1 << 9)
776#define XFS_AGI_UNLINKED (1 << 10)
777#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
778#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
779#define XFS_AGI_FREE_ROOT (1 << 11)
780#define XFS_AGI_FREE_LEVEL (1 << 12)
781#define XFS_AGI_NUM_BITS_R2 13
782
783/* disk block (xfs_daddr_t) in the AG */
784#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
785#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
786#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
787
788/*
789 * The third a.g. block contains the a.g. freelist, an array
790 * of block pointers to blocks owned by the allocation btree code.
791 */
792#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
793#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
794#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
795
796#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
797 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
798 &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
799 (__be32 *)(bp)->b_addr)
800
801/*
802 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
803 * slots in the beginning of the block for a proper header with the
804 * location information and CRC.
805 */
806#define XFS_AGFL_SIZE(mp) \
807 (((mp)->m_sb.sb_sectsize - \
808 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
809 sizeof(struct xfs_agfl) : 0)) / \
810 sizeof(xfs_agblock_t))
811
812typedef struct xfs_agfl {
813 __be32 agfl_magicnum;
814 __be32 agfl_seqno;
815 uuid_t agfl_uuid;
816 __be64 agfl_lsn;
817 __be32 agfl_crc;
818 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
819} xfs_agfl_t;
820
821#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
822
823
824#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
825#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
826 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
827#define XFS_MIN_FREELIST(a,mp) \
828 (XFS_MIN_FREELIST_RAW( \
829 be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
830 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
831#define XFS_MIN_FREELIST_PAG(pag,mp) \
832 (XFS_MIN_FREELIST_RAW( \
833 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
834 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
835
836#define XFS_AGB_TO_FSB(mp,agno,agbno) \
837 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
838#define XFS_FSB_TO_AGNO(mp,fsbno) \
839 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
840#define XFS_FSB_TO_AGBNO(mp,fsbno) \
841 ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
842#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
843 ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
844 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
845#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
846
847/*
848 * For checking for bad ranges of xfs_daddr_t's, covering multiple
849 * allocation groups or a single xfs_daddr_t that's a superblock copy.
850 */
851#define XFS_AG_CHECK_DADDR(mp,d,len) \
852 ((len) == 1 ? \
853 ASSERT((d) == XFS_SB_DADDR || \
854 xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
855 ASSERT(xfs_daddr_to_agno(mp, d) == \
856 xfs_daddr_to_agno(mp, (d) + (len) - 1)))
857
858typedef struct xfs_timestamp {
859 __be32 t_sec; /* timestamp seconds */
860 __be32 t_nsec; /* timestamp nanoseconds */
861} xfs_timestamp_t;
862
/*
 * On-disk inode structure.
 *
 * This is just the header or "dinode core", the inode is expanded to fill a
 * variable size the leftover area split into a data and an attribute fork.
 * The format of the data and attribute fork depends on the format of the
 * inode as indicated by di_format and di_aformat.  To access the data and
 * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
 * below.
 *
 * There is a very similar struct icdinode in xfs_inode which matches the
 * layout of the first 96 bytes of this structure, but is kept in native
 * format instead of big endian.
 *
 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
 * padding field for v3 inodes.
 */
#define	XFS_DINODE_MAGIC		0x494e	/* 'IN' */
#define XFS_DINODE_GOOD_VERSION(v)	((v) >= 1 && (v) <= 3)
typedef struct xfs_dinode {
	__be16		di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
	__be16		di_mode;	/* mode and type of file */
	__u8		di_version;	/* inode version */
	__u8		di_format;	/* format of di_c data */
	__be16		di_onlink;	/* old number of links to file */
	__be32		di_uid;		/* owner's user id */
	__be32		di_gid;		/* owner's group id */
	__be32		di_nlink;	/* number of links to file */
	__be16		di_projid_lo;	/* lower part of owner's project id */
	__be16		di_projid_hi;	/* higher part owner's project id */
	__u8		di_pad[6];	/* unused, zeroed space */
	__be16		di_flushiter;	/* incremented on flush (v1/v2 only) */
	xfs_timestamp_t	di_atime;	/* time last accessed */
	xfs_timestamp_t	di_mtime;	/* time last modified */
	xfs_timestamp_t	di_ctime;	/* time created/inode modified */
	__be64		di_size;	/* number of bytes in file */
	__be64		di_nblocks;	/* # of direct & btree blocks used */
	__be32		di_extsize;	/* basic/minimum extent size for file */
	__be32		di_nextents;	/* number of extents in data fork */
	__be16		di_anextents;	/* number of extents in attribute fork*/
	__u8		di_forkoff;	/* attr fork offs, <<3 for 64b align */
	__s8		di_aformat;	/* format of attr fork's data */
	__be32		di_dmevmask;	/* DMIG event mask */
	__be16		di_dmstate;	/* DMIG state info */
	__be16		di_flags;	/* random flags, XFS_DIFLAG_... */
	__be32		di_gen;		/* generation number */

	/* di_next_unlinked is the only non-core field in the old dinode */
	__be32		di_next_unlinked;/* agi unlinked list ptr */

	/*
	 * Start of the extended dinode - everything from di_crc onwards is
	 * present on version 3 inodes only (see xfs_dinode_size()).
	 * These are the writable fields.
	 */
	__le32		di_crc;		/* CRC of the inode */
	__be64		di_changecount;	/* number of attribute changes */
	__be64		di_lsn;		/* flush sequence */
	__be64		di_flags2;	/* more random flags */
	__u8		di_pad2[16];	/* more padding for future expansion */

	/* fields only written to during inode creation */
	xfs_timestamp_t	di_crtime;	/* time created */
	__be64		di_ino;		/* inode number */
	uuid_t		di_uuid;	/* UUID of the filesystem */

	/* structure must be padded to 64 bit alignment */
} xfs_dinode_t;
927
/* Byte offset of the CRC field within the on-disk inode */
#define XFS_DINODE_CRC_OFF	offsetof(struct xfs_dinode, di_crc)

/* Maximum value of the 16 bit di_flushiter counter before it wraps */
#define DI_MAX_FLUSH 0xffff
931
932/*
933 * Size of the core inode on disk. Version 1 and 2 inodes have
934 * the same size, but version 3 has grown a few additional fields.
935 */
936static inline uint xfs_dinode_size(int version)
937{
938 if (version == 3)
939 return sizeof(struct xfs_dinode);
940 return offsetof(struct xfs_dinode, di_crc);
941}
942
/*
 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
 */
#define	XFS_MAXLINK		((1U << 31) - 1U)
#define	XFS_MAXLINK_1		65535U	/* v1 inodes: 16 bit di_onlink */
950
/*
 * Values for di_format.
 * These are on-disk format values stored in di_format/di_aformat, so
 * existing entries must not be renumbered.
 */
typedef enum xfs_dinode_fmt {
	XFS_DINODE_FMT_DEV,		/* xfs_dev_t */
	XFS_DINODE_FMT_LOCAL,		/* bulk data */
	XFS_DINODE_FMT_EXTENTS,		/* struct xfs_bmbt_rec */
	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
	XFS_DINODE_FMT_UUID		/* uuid_t */
} xfs_dinode_fmt_t;
961
/*
 * Inode minimum and maximum sizes (log2 of bytes, and bytes).
 */
#define	XFS_DINODE_MIN_LOG	8
#define	XFS_DINODE_MAX_LOG	11
#define	XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)	/* 256 bytes */
#define	XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)	/* 2048 bytes */

/*
 * Inode size for given fs: the bytes remaining in the inode after the
 * dinode core, shared between the data and attribute forks.
 */
#define	XFS_LITINO(mp, version) \
	((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
975
/*
 * Inode data & attribute fork sizes, per inode.
 */
/* Does the inode have an attribute fork? */
#define XFS_DFORK_Q(dip)		((dip)->di_forkoff != 0)
/* Byte offset of the attribute fork within the fork area (<<3 for 64 bit) */
#define XFS_DFORK_BOFF(dip)		((int)((dip)->di_forkoff << 3))

/* Data fork size: up to the attr fork if present, else the whole area */
#define XFS_DFORK_DSIZE(dip,mp) \
	(XFS_DFORK_Q(dip) ? \
		XFS_DFORK_BOFF(dip) : \
		XFS_LITINO(mp, (dip)->di_version))
/* Attr fork size: what remains past the attr fork offset, 0 if absent */
#define XFS_DFORK_ASIZE(dip,mp) \
	(XFS_DFORK_Q(dip) ? \
		XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
		0)
/* Size of the fork selected by w (XFS_DATA_FORK or the attr fork) */
#define XFS_DFORK_SIZE(dip,mp,w) \
	((w) == XFS_DATA_FORK ? \
		XFS_DFORK_DSIZE(dip, mp) : \
		XFS_DFORK_ASIZE(dip, mp))
994
/*
 * Return pointers to the data or attribute forks.
 *
 * Note: the macro argument is fully parenthesised.  The previous
 * definition expanded (dip) bare as "(char *)dip + ...", which misparses
 * for any non-trivial pointer expression passed as the argument.
 */
#define XFS_DFORK_DPTR(dip) \
	((char *)(dip) + xfs_dinode_size((dip)->di_version))
#define XFS_DFORK_APTR(dip)	\
	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w)	\
	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
1004
/* Format (di_format or di_aformat) of the fork selected by w */
#define XFS_DFORK_FORMAT(dip,w) \
	((w) == XFS_DATA_FORK ? \
		(dip)->di_format : \
		(dip)->di_aformat)
/* Extent count of the selected fork, converted from big endian */
#define XFS_DFORK_NEXTENTS(dip,w) \
	((w) == XFS_DATA_FORK ? \
		be32_to_cpu((dip)->di_nextents) : \
		be16_to_cpu((dip)->di_anextents))
1013
1014/*
1015 * For block and character special files the 32bit dev_t is stored at the
1016 * beginning of the data fork.
1017 */
1018static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
1019{
1020 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
1021}
1022
1023static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
1024{
1025 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
1026}
1027
/*
 * Values for di_flags.
 * There should be a one-to-one correspondence between these flags and the
 * XFS_XFLAG_s.  All of the bit values fit in the 16 bit on-disk di_flags
 * field.
 */
#define XFS_DIFLAG_REALTIME_BIT  0	/* file's blocks come from rt area */
#define XFS_DIFLAG_PREALLOC_BIT  1	/* file space has been preallocated */
#define XFS_DIFLAG_NEWRTBM_BIT   2	/* for rtbitmap inode, new format */
#define XFS_DIFLAG_IMMUTABLE_BIT 3	/* inode is immutable */
#define XFS_DIFLAG_APPEND_BIT    4	/* inode is append-only */
#define XFS_DIFLAG_SYNC_BIT      5	/* inode is written synchronously */
#define XFS_DIFLAG_NOATIME_BIT   6	/* do not update atime */
#define XFS_DIFLAG_NODUMP_BIT    7	/* do not dump */
#define XFS_DIFLAG_RTINHERIT_BIT 8	/* create with realtime bit set */
#define XFS_DIFLAG_PROJINHERIT_BIT   9	/* create with parents projid */
#define XFS_DIFLAG_NOSYMLINKS_BIT   10	/* disallow symlink creation */
#define XFS_DIFLAG_EXTSIZE_BIT      11	/* inode extent size allocator hint */
#define XFS_DIFLAG_EXTSZINHERIT_BIT 12	/* inherit inode extent size */
#define XFS_DIFLAG_NODEFRAG_BIT     13	/* do not reorganize/defragment */
#define XFS_DIFLAG_FILESTREAM_BIT   14  /* use filestream allocator */
#define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
#define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
#define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
#define XFS_DIFLAG_IMMUTABLE     (1 << XFS_DIFLAG_IMMUTABLE_BIT)
#define XFS_DIFLAG_APPEND        (1 << XFS_DIFLAG_APPEND_BIT)
#define XFS_DIFLAG_SYNC          (1 << XFS_DIFLAG_SYNC_BIT)
#define XFS_DIFLAG_NOATIME       (1 << XFS_DIFLAG_NOATIME_BIT)
#define XFS_DIFLAG_NODUMP        (1 << XFS_DIFLAG_NODUMP_BIT)
#define XFS_DIFLAG_RTINHERIT     (1 << XFS_DIFLAG_RTINHERIT_BIT)
#define XFS_DIFLAG_PROJINHERIT   (1 << XFS_DIFLAG_PROJINHERIT_BIT)
#define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
#define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
#define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
#define XFS_DIFLAG_FILESTREAM    (1 << XFS_DIFLAG_FILESTREAM_BIT)

/* Mask of every defined di_flags bit */
#define XFS_DIFLAG_ANY \
	(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
	 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
	 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
	 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
1070
/*
 * Inode number format:
 * low inopblog bits - offset in block
 * next agblklog bits - block number in ag
 * next agno_log bits - ag number
 * high agno_log-agblklog-inopblog bits - 0
 */
/* Mask with the low-order k bits set */
#define	XFS_INO_MASK(k)			(__uint32_t)((1ULL << (k)) - 1)
#define	XFS_INO_OFFSET_BITS(mp)		(mp)->m_sb.sb_inopblog
#define	XFS_INO_AGBNO_BITS(mp)		(mp)->m_sb.sb_agblklog
#define	XFS_INO_AGINO_BITS(mp)		(mp)->m_agino_log
#define	XFS_INO_AGNO_BITS(mp)		(mp)->m_agno_log
#define	XFS_INO_BITS(mp)		\
	XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
#define	XFS_INO_TO_AGNO(mp,i)		\
	((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
#define	XFS_INO_TO_AGINO(mp,i)		\
	((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
#define	XFS_INO_TO_AGBNO(mp,i)		\
	(((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
		XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
#define	XFS_INO_TO_OFFSET(mp,i)		\
	((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
#define	XFS_INO_TO_FSB(mp,i)		\
	XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
/* Combine an AG number and AG-relative inode number into a global inode # */
#define	XFS_AGINO_TO_INO(mp,a,i)	\
	(((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
#define	XFS_AGINO_TO_AGBNO(mp,i)	((i) >> XFS_INO_OFFSET_BITS(mp))
#define	XFS_AGINO_TO_OFFSET(mp,i)	\
	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
#define	XFS_OFFBNO_TO_AGINO(mp,b,o)	\
	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))

/* Largest representable inode numbers: 56 bits, and the 32 bit variant */
#define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
#define	XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
1106
1107/*
37 * RealTime Device format definitions 1108 * RealTime Device format definitions
38 */ 1109 */
39 1110
@@ -413,4 +1484,40 @@ struct xfs_btree_block {
413#define XFS_BTREE_LBLOCK_CRC_OFF \ 1484#define XFS_BTREE_LBLOCK_CRC_OFF \
414 offsetof(struct xfs_btree_block, bb_u.l.bb_crc) 1485 offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
415 1486
/*
 * On-disk XFS access control list structure: one ACL entry.
 * All fields are big-endian; the entry is explicitly padded to 12 bytes.
 */
struct xfs_acl_entry {
	__be32	ae_tag;
	__be32	ae_id;
	__be16	ae_perm;
	__be16	ae_pad;		/* fill the implicit hole in the structure */
};
1496
1497struct xfs_acl {
1498 __be32 acl_cnt;
1499 struct xfs_acl_entry acl_entry[0];
1500};
1501
/*
 * The number of ACL entries allowed is defined by the on-disk format.
 * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
 * limited only by the maximum size of the xattr that stores the information.
 *
 * (mp) is parenthesised so the macro expands correctly for any
 * mount-pointer expression, not just a plain identifier.
 */
#define XFS_ACL_MAX_ENTRIES(mp)	\
	(xfs_sb_version_hascrc(&(mp)->m_sb) \
		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
					sizeof(struct xfs_acl_entry) \
		: 25)

#define XFS_ACL_MAX_SIZE(mp) \
	(sizeof(struct xfs_acl) + \
		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
1516
/*
 * On-disk XFS extended attribute names.
 *
 * The *_SIZE macros must take sizeof the string literals directly:
 * the previous definitions applied sizeof to the cast SGI_ACL_* macros,
 * which yields sizeof(unsigned char *) - 1 — a pointer size (7 on LP64,
 * 3 on ILP32), not the attribute name length (12 and 15).
 */
#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
#define SGI_ACL_FILE_SIZE	(sizeof("SGI_ACL_FILE") - 1)
#define SGI_ACL_DEFAULT_SIZE	(sizeof("SGI_ACL_DEFAULT") - 1)
1522
416#endif /* __XFS_FORMAT_H__ */ 1523#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 23dcb72fc5e6..116ef1ddb3e3 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -22,9 +22,7 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_inode.h" 27#include "xfs_inode.h"
30#include "xfs_btree.h" 28#include "xfs_btree.h"
@@ -39,7 +37,6 @@
39#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
40#include "xfs_icreate_item.h" 38#include "xfs_icreate_item.h"
41#include "xfs_icache.h" 39#include "xfs_icache.h"
42#include "xfs_dinode.h"
43#include "xfs_trace.h" 40#include "xfs_trace.h"
44 41
45 42
@@ -48,12 +45,12 @@
48 */ 45 */
49static inline int 46static inline int
50xfs_ialloc_cluster_alignment( 47xfs_ialloc_cluster_alignment(
51 xfs_alloc_arg_t *args) 48 struct xfs_mount *mp)
52{ 49{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 50 if (xfs_sb_version_hasalign(&mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 51 mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) 52 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt; 53 return mp->m_sb.sb_inoalignmt;
57 return 1; 54 return 1;
58} 55}
59 56
@@ -412,7 +409,7 @@ xfs_ialloc_ag_alloc(
412 * but not to use them in the actual exact allocation. 409 * but not to use them in the actual exact allocation.
413 */ 410 */
414 args.alignment = 1; 411 args.alignment = 1;
415 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; 412 args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
416 413
417 /* Allow space for the inode btree to split. */ 414 /* Allow space for the inode btree to split. */
418 args.minleft = args.mp->m_in_maxlevels - 1; 415 args.minleft = args.mp->m_in_maxlevels - 1;
@@ -448,7 +445,7 @@ xfs_ialloc_ag_alloc(
448 args.alignment = args.mp->m_dalign; 445 args.alignment = args.mp->m_dalign;
449 isaligned = 1; 446 isaligned = 1;
450 } else 447 } else
451 args.alignment = xfs_ialloc_cluster_alignment(&args); 448 args.alignment = xfs_ialloc_cluster_alignment(args.mp);
452 /* 449 /*
453 * Need to figure out where to allocate the inode blocks. 450 * Need to figure out where to allocate the inode blocks.
454 * Ideally they should be spaced out through the a.g. 451 * Ideally they should be spaced out through the a.g.
@@ -477,7 +474,7 @@ xfs_ialloc_ag_alloc(
477 args.type = XFS_ALLOCTYPE_NEAR_BNO; 474 args.type = XFS_ALLOCTYPE_NEAR_BNO;
478 args.agbno = be32_to_cpu(agi->agi_root); 475 args.agbno = be32_to_cpu(agi->agi_root);
479 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 476 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
480 args.alignment = xfs_ialloc_cluster_alignment(&args); 477 args.alignment = xfs_ialloc_cluster_alignment(args.mp);
481 if ((error = xfs_alloc_vextent(&args))) 478 if ((error = xfs_alloc_vextent(&args)))
482 return error; 479 return error;
483 } 480 }
@@ -632,10 +629,24 @@ xfs_ialloc_ag_select(
632 } 629 }
633 630
634 /* 631 /*
635 * Is there enough free space for the file plus a block of 632 * Check that there is enough free space for the file plus a
636 * inodes? (if we need to allocate some)? 633 * chunk of inodes if we need to allocate some. If this is the
634 * first pass across the AGs, take into account the potential
635 * space needed for alignment of inode chunks when checking the
636 * longest contiguous free space in the AG - this prevents us
637 * from getting ENOSPC because we have free space larger than
638 * m_ialloc_blks but alignment constraints prevent us from using
639 * it.
640 *
641 * If we can't find an AG with space for full alignment slack to
642 * be taken into account, we must be near ENOSPC in all AGs.
643 * Hence we don't include alignment for the second pass and so
644 * if we fail allocation due to alignment issues then it is most
645 * likely a real ENOSPC condition.
637 */ 646 */
638 ineed = mp->m_ialloc_blks; 647 ineed = mp->m_ialloc_blks;
648 if (flags && ineed > 1)
649 ineed += xfs_ialloc_cluster_alignment(mp);
639 longest = pag->pagf_longest; 650 longest = pag->pagf_longest;
640 if (!longest) 651 if (!longest)
641 longest = pag->pagf_flcount > 0; 652 longest = pag->pagf_flcount > 0;
@@ -1137,11 +1148,7 @@ xfs_dialloc_ag_update_inobt(
1137 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && 1148 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
1138 (rec.ir_freecount == frec->ir_freecount)); 1149 (rec.ir_freecount == frec->ir_freecount));
1139 1150
1140 error = xfs_inobt_update(cur, &rec); 1151 return xfs_inobt_update(cur, &rec);
1141 if (error)
1142 return error;
1143
1144 return 0;
1145} 1152}
1146 1153
1147/* 1154/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..100007d56449 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -160,4 +160,8 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
160 xfs_agnumber_t agno, xfs_agblock_t agbno, 160 xfs_agnumber_t agno, xfs_agblock_t agbno,
161 xfs_agblock_t length, unsigned int gen); 161 xfs_agblock_t length, unsigned int gen);
162 162
163int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
164 xfs_agnumber_t agno, struct xfs_buf **bpp);
165
166
163#endif /* __XFS_IALLOC_H__ */ 167#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index c9b06f30fe86..964c465ca69c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_btree.h" 27#include "xfs_btree.h"
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f18fd2da49f7..002b6b3a1988 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_error.h" 26#include "xfs_error.h"
@@ -30,7 +28,6 @@
30#include "xfs_icache.h" 28#include "xfs_icache.h"
31#include "xfs_trans.h" 29#include "xfs_trans.h"
32#include "xfs_ialloc.h" 30#include "xfs_ialloc.h"
33#include "xfs_dinode.h"
34 31
35/* 32/*
36 * Check that none of the inode's in the buffer have a next 33 * Check that none of the inode's in the buffer have a next
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 6a00f7fed69d..0defbd02f62d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -22,9 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 25#include "xfs_mount.h"
29#include "xfs_inode.h" 26#include "xfs_inode.h"
30#include "xfs_trans.h" 27#include "xfs_trans.h"
@@ -34,7 +31,6 @@
34#include "xfs_error.h" 31#include "xfs_error.h"
35#include "xfs_trace.h" 32#include "xfs_trace.h"
36#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
37#include "xfs_dinode.h"
38 34
39kmem_zone_t *xfs_ifork_zone; 35kmem_zone_t *xfs_ifork_zone;
40 36
diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
deleted file mode 100644
index 4ff2278e147a..000000000000
--- a/fs/xfs/libxfs/xfs_inum.h
+++ /dev/null
@@ -1,60 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INUM_H__
19#define __XFS_INUM_H__
20
21/*
22 * Inode number format:
23 * low inopblog bits - offset in block
24 * next agblklog bits - block number in ag
25 * next agno_log bits - ag number
26 * high agno_log-agblklog-inopblog bits - 0
27 */
28
29struct xfs_mount;
30
31#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
32#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
33#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
34#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
35#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
36#define XFS_INO_BITS(mp) \
37 XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
38#define XFS_INO_TO_AGNO(mp,i) \
39 ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
40#define XFS_INO_TO_AGINO(mp,i) \
41 ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
42#define XFS_INO_TO_AGBNO(mp,i) \
43 (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
44 XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
45#define XFS_INO_TO_OFFSET(mp,i) \
46 ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
47#define XFS_INO_TO_FSB(mp,i) \
48 XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
49#define XFS_AGINO_TO_INO(mp,a,i) \
50 (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
51#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
52#define XFS_AGINO_TO_OFFSET(mp,i) \
53 ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
54#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
55 ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
56
57#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
58#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
59
60#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index aff12f2d4428..265314690415 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -361,7 +361,7 @@ typedef struct xfs_ictimestamp {
361 361
362/* 362/*
363 * NOTE: This structure must be kept identical to struct xfs_dinode 363 * NOTE: This structure must be kept identical to struct xfs_dinode
364 * in xfs_dinode.h except for the endianness annotations. 364 * except for the endianness annotations.
365 */ 365 */
366typedef struct xfs_icdinode { 366typedef struct xfs_icdinode {
367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..c10597973333 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_ag.h"
25#include "xfs_sb.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_trans_space.h" 26#include "xfs_trans_space.h"
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 7c818f1e4484..9b59ffa1fc19 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_bmap.h" 27#include "xfs_bmap.h"
@@ -36,7 +34,6 @@
36#include "xfs_trace.h" 34#include "xfs_trace.h"
37#include "xfs_buf.h" 35#include "xfs_buf.h"
38#include "xfs_icache.h" 36#include "xfs_icache.h"
39#include "xfs_dinode.h"
40#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
41 38
42 39
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5f902fa7913f..752915fa775a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -23,7 +23,6 @@
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
29#include "xfs_ialloc.h" 28#include "xfs_ialloc.h"
@@ -33,7 +32,6 @@
33#include "xfs_cksum.h" 32#include "xfs_cksum.h"
34#include "xfs_trans.h" 33#include "xfs_trans.h"
35#include "xfs_buf_item.h" 34#include "xfs_buf_item.h"
36#include "xfs_dinode.h"
37#include "xfs_bmap_btree.h" 35#include "xfs_bmap_btree.h"
38#include "xfs_alloc_btree.h" 36#include "xfs_alloc_btree.h"
39#include "xfs_ialloc_btree.h" 37#include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 2e739708afd3..8eb1c54bafbf 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -19,590 +19,6 @@
19#define __XFS_SB_H__ 19#define __XFS_SB_H__
20 20
21/* 21/*
22 * Super block
23 * Fits into a sector-sized buffer at address 0 of each allocation group.
24 * Only the first of these is ever updated except during growfs.
25 */
26
27struct xfs_buf;
28struct xfs_mount;
29struct xfs_trans;
30
31#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
32#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
33#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
34#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
35#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
36#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
37#define XFS_SB_VERSION_NUMBITS 0x000f
38#define XFS_SB_VERSION_ALLFBITS 0xfff0
39#define XFS_SB_VERSION_ATTRBIT 0x0010
40#define XFS_SB_VERSION_NLINKBIT 0x0020
41#define XFS_SB_VERSION_QUOTABIT 0x0040
42#define XFS_SB_VERSION_ALIGNBIT 0x0080
43#define XFS_SB_VERSION_DALIGNBIT 0x0100
44#define XFS_SB_VERSION_SHAREDBIT 0x0200
45#define XFS_SB_VERSION_LOGV2BIT 0x0400
46#define XFS_SB_VERSION_SECTORBIT 0x0800
47#define XFS_SB_VERSION_EXTFLGBIT 0x1000
48#define XFS_SB_VERSION_DIRV2BIT 0x2000
49#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
50#define XFS_SB_VERSION_MOREBITSBIT 0x8000
51
52/*
53 * Supported feature bit list is just all bits in the versionnum field because
54 * we've used them all up and understand them all. Except, of course, for the
55 * shared superblock bit, which nobody knows what it does and so is unsupported.
56 */
57#define XFS_SB_VERSION_OKBITS \
58 ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
59 ~XFS_SB_VERSION_SHAREDBIT)
60
61/*
62 * There are two words to hold XFS "feature" bits: the original
63 * word, sb_versionnum, and sb_features2. Whenever a bit is set in
64 * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
65 *
66 * These defines represent bits in sb_features2.
67 */
68#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
69#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
70#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
71#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
72#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
73#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
74#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
75#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
76
77#define XFS_SB_VERSION2_OKBITS \
78 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
79 XFS_SB_VERSION2_ATTR2BIT | \
80 XFS_SB_VERSION2_PROJID32BIT | \
81 XFS_SB_VERSION2_FTYPE)
82
83/*
84 * Superblock - in core version. Must match the ondisk version below.
85 * Must be padded to 64 bit alignment.
86 */
87typedef struct xfs_sb {
88 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
89 __uint32_t sb_blocksize; /* logical block size, bytes */
90 xfs_rfsblock_t sb_dblocks; /* number of data blocks */
91 xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
92 xfs_rtblock_t sb_rextents; /* number of realtime extents */
93 uuid_t sb_uuid; /* file system unique id */
94 xfs_fsblock_t sb_logstart; /* starting block of log if internal */
95 xfs_ino_t sb_rootino; /* root inode number */
96 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
97 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
98 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
99 xfs_agblock_t sb_agblocks; /* size of an allocation group */
100 xfs_agnumber_t sb_agcount; /* number of allocation groups */
101 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
102 xfs_extlen_t sb_logblocks; /* number of log blocks */
103 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
104 __uint16_t sb_sectsize; /* volume sector size, bytes */
105 __uint16_t sb_inodesize; /* inode size, bytes */
106 __uint16_t sb_inopblock; /* inodes per block */
107 char sb_fname[12]; /* file system name */
108 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
109 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
110 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
111 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
112 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
113 __uint8_t sb_rextslog; /* log2 of sb_rextents */
114 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
115 __uint8_t sb_imax_pct; /* max % of fs for inode space */
116 /* statistics */
117 /*
118 * These fields must remain contiguous. If you really
119 * want to change their layout, make sure you fix the
120 * code in xfs_trans_apply_sb_deltas().
121 */
122 __uint64_t sb_icount; /* allocated inodes */
123 __uint64_t sb_ifree; /* free inodes */
124 __uint64_t sb_fdblocks; /* free data blocks */
125 __uint64_t sb_frextents; /* free realtime extents */
126 /*
127 * End contiguous fields.
128 */
129 xfs_ino_t sb_uquotino; /* user quota inode */
130 xfs_ino_t sb_gquotino; /* group quota inode */
131 __uint16_t sb_qflags; /* quota flags */
132 __uint8_t sb_flags; /* misc. flags */
133 __uint8_t sb_shared_vn; /* shared version number */
134 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
135 __uint32_t sb_unit; /* stripe or raid unit */
136 __uint32_t sb_width; /* stripe or raid width */
137 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
138 __uint8_t sb_logsectlog; /* log2 of the log sector size */
139 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
140 __uint32_t sb_logsunit; /* stripe unit size for the log */
141 __uint32_t sb_features2; /* additional feature bits */
142
143 /*
144 * bad features2 field as a result of failing to pad the sb
145 * structure to 64 bits. Some machines will be using this field
146 * for features2 bits. Easiest just to mark it bad and not use
147 * it for anything else.
148 */
149 __uint32_t sb_bad_features2;
150
151 /* version 5 superblock fields start here */
152
153 /* feature masks */
154 __uint32_t sb_features_compat;
155 __uint32_t sb_features_ro_compat;
156 __uint32_t sb_features_incompat;
157 __uint32_t sb_features_log_incompat;
158
159 __uint32_t sb_crc; /* superblock crc */
160 __uint32_t sb_pad;
161
162 xfs_ino_t sb_pquotino; /* project quota inode */
163 xfs_lsn_t sb_lsn; /* last write sequence */
164
165 /* must be padded to 64 bit alignment */
166} xfs_sb_t;
167
168#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
169
170/*
171 * Superblock - on disk version. Must match the in core version above.
172 * Must be padded to 64 bit alignment.
173 */
174typedef struct xfs_dsb {
175 __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
176 __be32 sb_blocksize; /* logical block size, bytes */
177 __be64 sb_dblocks; /* number of data blocks */
178 __be64 sb_rblocks; /* number of realtime blocks */
179 __be64 sb_rextents; /* number of realtime extents */
180 uuid_t sb_uuid; /* file system unique id */
181 __be64 sb_logstart; /* starting block of log if internal */
182 __be64 sb_rootino; /* root inode number */
183 __be64 sb_rbmino; /* bitmap inode for realtime extents */
184 __be64 sb_rsumino; /* summary inode for rt bitmap */
185 __be32 sb_rextsize; /* realtime extent size, blocks */
186 __be32 sb_agblocks; /* size of an allocation group */
187 __be32 sb_agcount; /* number of allocation groups */
188 __be32 sb_rbmblocks; /* number of rt bitmap blocks */
189 __be32 sb_logblocks; /* number of log blocks */
190 __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
191 __be16 sb_sectsize; /* volume sector size, bytes */
192 __be16 sb_inodesize; /* inode size, bytes */
193 __be16 sb_inopblock; /* inodes per block */
194 char sb_fname[12]; /* file system name */
195 __u8 sb_blocklog; /* log2 of sb_blocksize */
196 __u8 sb_sectlog; /* log2 of sb_sectsize */
197 __u8 sb_inodelog; /* log2 of sb_inodesize */
198 __u8 sb_inopblog; /* log2 of sb_inopblock */
199 __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
200 __u8 sb_rextslog; /* log2 of sb_rextents */
201 __u8 sb_inprogress; /* mkfs is in progress, don't mount */
202 __u8 sb_imax_pct; /* max % of fs for inode space */
203 /* statistics */
204 /*
205 * These fields must remain contiguous. If you really
206 * want to change their layout, make sure you fix the
207 * code in xfs_trans_apply_sb_deltas().
208 */
209 __be64 sb_icount; /* allocated inodes */
210 __be64 sb_ifree; /* free inodes */
211 __be64 sb_fdblocks; /* free data blocks */
212 __be64 sb_frextents; /* free realtime extents */
213 /*
214 * End contiguous fields.
215 */
216 __be64 sb_uquotino; /* user quota inode */
217 __be64 sb_gquotino; /* group quota inode */
218 __be16 sb_qflags; /* quota flags */
219 __u8 sb_flags; /* misc. flags */
220 __u8 sb_shared_vn; /* shared version number */
221 __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
222 __be32 sb_unit; /* stripe or raid unit */
223 __be32 sb_width; /* stripe or raid width */
224 __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
225 __u8 sb_logsectlog; /* log2 of the log sector size */
226 __be16 sb_logsectsize; /* sector size for the log, bytes */
227 __be32 sb_logsunit; /* stripe unit size for the log */
228 __be32 sb_features2; /* additional feature bits */
229 /*
230 * bad features2 field as a result of failing to pad the sb
231 * structure to 64 bits. Some machines will be using this field
232 * for features2 bits. Easiest just to mark it bad and not use
233 * it for anything else.
234 */
235 __be32 sb_bad_features2;
236
237 /* version 5 superblock fields start here */
238
239 /* feature masks */
240 __be32 sb_features_compat;
241 __be32 sb_features_ro_compat;
242 __be32 sb_features_incompat;
243 __be32 sb_features_log_incompat;
244
245 __le32 sb_crc; /* superblock crc */
246 __be32 sb_pad;
247
248 __be64 sb_pquotino; /* project quota inode */
249 __be64 sb_lsn; /* last write sequence */
250
251 /* must be padded to 64 bit alignment */
252} xfs_dsb_t;
253
254/*
255 * Sequence number values for the fields.
256 */
257typedef enum {
258 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
259 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
260 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
261 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
262 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
263 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
264 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
265 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
266 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
267 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
268 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
269 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
270 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
271 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
272 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
273 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
274 XFS_SBS_FIELDCOUNT
275} xfs_sb_field_t;
276
277/*
278 * Mask values, defined based on the xfs_sb_field_t values.
279 * Only define the ones we're using.
280 */
281#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
282#define XFS_SB_UUID XFS_SB_MVAL(UUID)
283#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
284#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
285#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
286#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
287#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
288#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
289#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
290#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
291#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
292#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
293#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
294#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
295#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
296#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
297#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
298#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
299#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
300#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
301#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
302#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
303#define XFS_SB_CRC XFS_SB_MVAL(CRC)
304#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
305#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
306#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
307#define XFS_SB_MOD_BITS \
308 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
309 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
310 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
311 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
312 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
313 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
314 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
315
316
317/*
318 * Misc. Flags - warning - these will be cleared by xfs_repair unless
319 * a feature bit is set when the flag is used.
320 */
321#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
322#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
323
324/*
325 * define max. shared version we can interoperate with
326 */
327#define XFS_SB_MAX_SHARED_VN 0
328
329#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
330
331/*
332 * The first XFS version we support is a v4 superblock with V2 directories.
333 */
334static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
335{
336 if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
337 return false;
338
339 /* check for unknown features in the fs */
340 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
341 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
342 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
343 return false;
344
345 return true;
346}
347
348static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
349{
350 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
351 return true;
352 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
353 return xfs_sb_good_v4_features(sbp);
354 return false;
355}
356
357/*
358 * Detect a mismatched features2 field. Older kernels read/wrote
359 * this into the wrong slot, so to be safe we keep them in sync.
360 */
361static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
362{
363 return sbp->sb_bad_features2 != sbp->sb_features2;
364}
365
366static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
367{
368 return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
369}
370
371static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
372{
373 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
374}
375
376static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
377{
378 return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
379}
380
381static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
382{
383 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
384}
385
386static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
387{
388 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
389 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
390}
391
392static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
393{
394 return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
395}
396
397static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
398{
399 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
400 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
401}
402
403static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
404{
405 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
406 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
407}
408
409static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
410{
411 return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
412}
413
414static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
415{
416 return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
417}
418
419static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
420{
421 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
422 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
423}
424
425/*
426 * sb_features2 bit version macros.
427 */
428static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
429{
430 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
431 (xfs_sb_version_hasmorebits(sbp) &&
432 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
433}
434
435static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
436{
437 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
438 (xfs_sb_version_hasmorebits(sbp) &&
439 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
440}
441
442static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
443{
444 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
445 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
446 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
447}
448
449static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
450{
451 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
452 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
453 if (!sbp->sb_features2)
454 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
455}
456
457static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
458{
459 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
460 (xfs_sb_version_hasmorebits(sbp) &&
461 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
462}
463
464static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
465{
466 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
467 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
468 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
469}
470
471/*
472 * Extended v5 superblock feature masks. These are to be used for new v5
473 * superblock features only.
474 *
475 * Compat features are new features that old kernels will not notice or affect
476 * and so can mount read-write without issues.
477 *
478 * RO-Compat (read only) are features that old kernels can read but will break
479 * if they write. Hence only read-only mounts of such filesystems are allowed on
480 * kernels that don't support the feature bit.
481 *
482 * InCompat features are features which old kernels will not understand and so
483 * must not mount.
484 *
485 * Log-InCompat features are for changes to log formats or new transactions that
486 * can't be replayed on older kernels. The fields are set when the filesystem is
487 * mounted, and a clean unmount clears the fields.
488 */
489#define XFS_SB_FEAT_COMPAT_ALL 0
490#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
491static inline bool
492xfs_sb_has_compat_feature(
493 struct xfs_sb *sbp,
494 __uint32_t feature)
495{
496 return (sbp->sb_features_compat & feature) != 0;
497}
498
499#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
500#define XFS_SB_FEAT_RO_COMPAT_ALL \
501 (XFS_SB_FEAT_RO_COMPAT_FINOBT)
502#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
503static inline bool
504xfs_sb_has_ro_compat_feature(
505 struct xfs_sb *sbp,
506 __uint32_t feature)
507{
508 return (sbp->sb_features_ro_compat & feature) != 0;
509}
510
511#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
512#define XFS_SB_FEAT_INCOMPAT_ALL \
513 (XFS_SB_FEAT_INCOMPAT_FTYPE)
514
515#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
516static inline bool
517xfs_sb_has_incompat_feature(
518 struct xfs_sb *sbp,
519 __uint32_t feature)
520{
521 return (sbp->sb_features_incompat & feature) != 0;
522}
523
524#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
525#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
526static inline bool
527xfs_sb_has_incompat_log_feature(
528 struct xfs_sb *sbp,
529 __uint32_t feature)
530{
531 return (sbp->sb_features_log_incompat & feature) != 0;
532}
533
534/*
535 * V5 superblock specific feature checks
536 */
537static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
538{
539 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
540}
541
542static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
543{
544 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
545}
546
547static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
548{
549 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
550 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
551 (xfs_sb_version_hasmorebits(sbp) &&
552 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
553}
554
555static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
556{
557 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
558 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
559}
560
561/*
562 * end of superblock version macros
563 */
564
565static inline bool
566xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
567{
568 return (ino == sbp->sb_uquotino ||
569 ino == sbp->sb_gquotino ||
570 ino == sbp->sb_pquotino);
571}
572
573#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
574#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
575#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
576
577#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
578#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
579 xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
580#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
581 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
582
583/*
584 * File system sector to basic block conversions.
585 */
586#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
587
588/*
589 * File system block to basic block conversions.
590 */
591#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
592#define XFS_BB_TO_FSB(mp,bb) \
593 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
594#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
595
596/*
597 * File system block to byte conversions.
598 */
599#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
600#define XFS_B_TO_FSB(mp,b) \
601 ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
602#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
603#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
604
605/*
606 * perag get/put wrappers for ref counting 22 * perag get/put wrappers for ref counting
607 */ 23 */
608extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); 24extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 5782f037eab4..c80c5236c3da 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_shared.h" 23#include "xfs_shared.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_ag.h"
26#include "xfs_sb.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 27#include "xfs_inode.h"
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..6c1330f29050 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index a65fa5dde6e9..4b641676f258 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -19,8 +19,6 @@
19#include "xfs_format.h" 19#include "xfs_format.h"
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_ag.h"
23#include "xfs_sb.h"
24#include "xfs_mount.h" 22#include "xfs_mount.h"
25#include "xfs_inode.h" 23#include "xfs_inode.h"
26#include "xfs_acl.h" 24#include "xfs_acl.h"
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 5dc163744511..3841b07f27bf 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,42 +22,6 @@ struct inode;
22struct posix_acl; 22struct posix_acl;
23struct xfs_inode; 23struct xfs_inode;
24 24
25#define XFS_ACL_NOT_PRESENT (-1)
26
27/* On-disk XFS access control list structure */
28struct xfs_acl_entry {
29 __be32 ae_tag;
30 __be32 ae_id;
31 __be16 ae_perm;
32 __be16 ae_pad; /* fill the implicit hole in the structure */
33};
34
35struct xfs_acl {
36 __be32 acl_cnt;
37 struct xfs_acl_entry acl_entry[0];
38};
39
40/*
41 * The number of ACL entries allowed is defined by the on-disk format.
42 * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
43 * limited only by the maximum size of the xattr that stores the information.
44 */
45#define XFS_ACL_MAX_ENTRIES(mp) \
46 (xfs_sb_version_hascrc(&mp->m_sb) \
47 ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
48 sizeof(struct xfs_acl_entry) \
49 : 25)
50
51#define XFS_ACL_MAX_SIZE(mp) \
52 (sizeof(struct xfs_acl) + \
53 sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
54
55/* On-disk XFS extended attribute names */
56#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
57#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
58#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
59#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
60
61#ifdef CONFIG_XFS_POSIX_ACL 25#ifdef CONFIG_XFS_POSIX_ACL
62extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 26extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
63extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); 27extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f5b2453a43b2..18e2f3bbae5e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_inode.h" 24#include "xfs_inode.h"
27#include "xfs_trans.h" 25#include "xfs_trans.h"
@@ -33,7 +31,6 @@
33#include "xfs_bmap.h" 31#include "xfs_bmap.h"
34#include "xfs_bmap_util.h" 32#include "xfs_bmap_util.h"
35#include "xfs_bmap_btree.h" 33#include "xfs_bmap_btree.h"
36#include "xfs_dinode.h"
37#include <linux/aio.h> 34#include <linux/aio.h>
38#include <linux/gfp.h> 35#include <linux/gfp.h>
39#include <linux/mpage.h> 36#include <linux/mpage.h>
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index aa2a8b1838a2..83af4c149635 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -39,7 +37,6 @@
39#include "xfs_error.h" 37#include "xfs_error.h"
40#include "xfs_quota.h" 38#include "xfs_quota.h"
41#include "xfs_trace.h" 39#include "xfs_trace.h"
42#include "xfs_dinode.h"
43#include "xfs_dir2.h" 40#include "xfs_dir2.h"
44 41
45/* 42/*
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 62db83ab6cbc..a43d370d2c58 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -39,7 +37,6 @@
39#include "xfs_trace.h" 37#include "xfs_trace.h"
40#include "xfs_buf_item.h" 38#include "xfs_buf_item.h"
41#include "xfs_cksum.h" 39#include "xfs_cksum.h"
42#include "xfs_dinode.h"
43#include "xfs_dir2.h" 40#include "xfs_dir2.h"
44 41
45STATIC int 42STATIC int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 281002689d64..22a5dcb70b32 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_inode.h" 28#include "xfs_inode.h"
@@ -42,7 +40,6 @@
42#include "xfs_trace.h" 40#include "xfs_trace.h"
43#include "xfs_icache.h" 41#include "xfs_icache.h"
44#include "xfs_log.h" 42#include "xfs_log.h"
45#include "xfs_dinode.h"
46 43
47/* Kernel only BMAP related definitions and functions */ 44/* Kernel only BMAP related definitions and functions */
48 45
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 24b4ebea0d4d..bb502a391792 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -34,18 +34,16 @@
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_format.h"
37#include "xfs_log_format.h" 38#include "xfs_log_format.h"
38#include "xfs_trans_resv.h" 39#include "xfs_trans_resv.h"
39#include "xfs_sb.h" 40#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_mount.h" 41#include "xfs_mount.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_log.h" 43#include "xfs_log.h"
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46 46
47static struct workqueue_struct *xfslogd_workqueue;
48
49#ifdef XFS_BUF_LOCK_TRACKING 47#ifdef XFS_BUF_LOCK_TRACKING
50# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 48# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
51# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 49# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
@@ -463,7 +461,7 @@ _xfs_buf_find(
463 * have to check that the buffer falls within the filesystem bounds. 461 * have to check that the buffer falls within the filesystem bounds.
464 */ 462 */
465 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 463 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
466 if (blkno >= eofs) { 464 if (blkno < 0 || blkno >= eofs) {
467 /* 465 /*
468 * XXX (dgc): we should really be returning -EFSCORRUPTED here, 466 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
469 * but none of the higher level infrastructure supports 467 * but none of the higher level infrastructure supports
@@ -1043,7 +1041,7 @@ xfs_buf_ioend_work(
1043 struct work_struct *work) 1041 struct work_struct *work)
1044{ 1042{
1045 struct xfs_buf *bp = 1043 struct xfs_buf *bp =
1046 container_of(work, xfs_buf_t, b_iodone_work); 1044 container_of(work, xfs_buf_t, b_ioend_work);
1047 1045
1048 xfs_buf_ioend(bp); 1046 xfs_buf_ioend(bp);
1049} 1047}
@@ -1052,8 +1050,8 @@ void
1052xfs_buf_ioend_async( 1050xfs_buf_ioend_async(
1053 struct xfs_buf *bp) 1051 struct xfs_buf *bp)
1054{ 1052{
1055 INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); 1053 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1056 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1054 queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
1057} 1055}
1058 1056
1059void 1057void
@@ -1222,6 +1220,13 @@ _xfs_buf_ioapply(
1222 */ 1220 */
1223 bp->b_error = 0; 1221 bp->b_error = 0;
1224 1222
1223 /*
1224 * Initialize the I/O completion workqueue if we haven't yet or the
1225 * submitter has not opted to specify a custom one.
1226 */
1227 if (!bp->b_ioend_wq)
1228 bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
1229
1225 if (bp->b_flags & XBF_WRITE) { 1230 if (bp->b_flags & XBF_WRITE) {
1226 if (bp->b_flags & XBF_SYNCIO) 1231 if (bp->b_flags & XBF_SYNCIO)
1227 rw = WRITE_SYNC; 1232 rw = WRITE_SYNC;
@@ -1882,15 +1887,8 @@ xfs_buf_init(void)
1882 if (!xfs_buf_zone) 1887 if (!xfs_buf_zone)
1883 goto out; 1888 goto out;
1884 1889
1885 xfslogd_workqueue = alloc_workqueue("xfslogd",
1886 WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
1887 if (!xfslogd_workqueue)
1888 goto out_free_buf_zone;
1889
1890 return 0; 1890 return 0;
1891 1891
1892 out_free_buf_zone:
1893 kmem_zone_destroy(xfs_buf_zone);
1894 out: 1892 out:
1895 return -ENOMEM; 1893 return -ENOMEM;
1896} 1894}
@@ -1898,6 +1896,5 @@ xfs_buf_init(void)
1898void 1896void
1899xfs_buf_terminate(void) 1897xfs_buf_terminate(void)
1900{ 1898{
1901 destroy_workqueue(xfslogd_workqueue);
1902 kmem_zone_destroy(xfs_buf_zone); 1899 kmem_zone_destroy(xfs_buf_zone);
1903} 1900}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 82002c00af90..75ff5d5a7d2e 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -164,7 +164,8 @@ typedef struct xfs_buf {
164 struct xfs_perag *b_pag; /* contains rbtree root */ 164 struct xfs_perag *b_pag; /* contains rbtree root */
165 xfs_buftarg_t *b_target; /* buffer target (device) */ 165 xfs_buftarg_t *b_target; /* buffer target (device) */
166 void *b_addr; /* virtual address of buffer */ 166 void *b_addr; /* virtual address of buffer */
167 struct work_struct b_iodone_work; 167 struct work_struct b_ioend_work;
168 struct workqueue_struct *b_ioend_wq; /* I/O completion wq */
168 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 169 xfs_buf_iodone_t b_iodone; /* I/O completion function */
169 struct completion b_iowait; /* queue for I/O waiters */ 170 struct completion b_iowait; /* queue for I/O waiters */
170 void *b_fspriv; 171 void *b_fspriv;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f15969543326..3f9bd58edec7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,11 +17,11 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
22#include "xfs_bit.h" 23#include "xfs_bit.h"
23#include "xfs_sb.h" 24#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_trans.h" 26#include "xfs_trans.h"
27#include "xfs_buf_item.h" 27#include "xfs_buf_item.h"
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f1b69edcdf31..098cd78fe708 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -34,7 +32,6 @@
34#include "xfs_trace.h" 32#include "xfs_trace.h"
35#include "xfs_bmap.h" 33#include "xfs_bmap.h"
36#include "xfs_trans.h" 34#include "xfs_trans.h"
37#include "xfs_dinode.h"
38 35
39/* 36/*
40 * Directory file type support functions 37 * Directory file type support functions
@@ -44,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = {
44 DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, 41 DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
45}; 42};
46 43
47unsigned char 44static unsigned char
48xfs_dir3_get_dtype( 45xfs_dir3_get_dtype(
49 struct xfs_mount *mp, 46 struct xfs_mount *mp,
50 __uint8_t filetype) 47 __uint8_t filetype)
@@ -57,22 +54,6 @@ xfs_dir3_get_dtype(
57 54
58 return xfs_dir3_filetype_table[filetype]; 55 return xfs_dir3_filetype_table[filetype];
59} 56}
60/*
61 * @mode, if set, indicates that the type field needs to be set up.
62 * This uses the transformation from file mode to DT_* as defined in linux/fs.h
63 * for file type specification. This will be propagated into the directory
64 * structure if appropriate for the given operation and filesystem config.
65 */
66const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
67 [0] = XFS_DIR3_FT_UNKNOWN,
68 [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
69 [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
70 [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
71 [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
72 [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
73 [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
74 [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
75};
76 57
77STATIC int 58STATIC int
78xfs_dir2_sf_getdents( 59xfs_dir2_sf_getdents(
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 13d08a1b390e..799e5a2d334d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -20,7 +20,6 @@
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 23#include "xfs_mount.h"
25#include "xfs_quota.h" 24#include "xfs_quota.h"
26#include "xfs_inode.h" 25#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 63c2de49f61d..02c01bbbc789 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -22,8 +22,6 @@
22#include "xfs_shared.h" 22#include "xfs_shared.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_bmap.h" 27#include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index f33fbaaa4d8a..814cff94e78f 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_inode.h" 24#include "xfs_inode.h"
27#include "xfs_quota.h" 25#include "xfs_quota.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b92fd7bc49e3..3ee186ac1093 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -20,8 +20,6 @@
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_error.h" 24#include "xfs_error.h"
27 25
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5a6bd5d8779a..5eb4a14e0a0f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -19,10 +19,9 @@
19#include "xfs_format.h" 19#include "xfs_format.h"
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 22#include "xfs_mount.h"
25#include "xfs_da_format.h" 23#include "xfs_da_format.h"
24#include "xfs_da_btree.h"
26#include "xfs_dir2.h" 25#include "xfs_dir2.h"
27#include "xfs_export.h" 26#include "xfs_export.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index fd22f69049d4..c263e079273e 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -24,7 +24,6 @@
24#include "xfs_shared.h" 24#include "xfs_shared.h"
25#include "xfs_trans_resv.h" 25#include "xfs_trans_resv.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_alloc.h" 28#include "xfs_alloc.h"
30#include "xfs_extent_busy.h" 29#include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index c4327419dc5c..cb7fe64cdbfa 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,10 +17,9 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 23#include "xfs_mount.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index eb596b419942..13e974e6a889 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -37,7 +35,6 @@
37#include "xfs_ioctl.h" 35#include "xfs_ioctl.h"
38#include "xfs_trace.h" 36#include "xfs_trace.h"
39#include "xfs_log.h" 37#include "xfs_log.h"
40#include "xfs_dinode.h"
41#include "xfs_icache.h" 38#include "xfs_icache.h"
42 39
43#include <linux/aio.h> 40#include <linux/aio.h>
@@ -933,7 +930,6 @@ xfs_file_readdir(
933{ 930{
934 struct inode *inode = file_inode(file); 931 struct inode *inode = file_inode(file);
935 xfs_inode_t *ip = XFS_I(inode); 932 xfs_inode_t *ip = XFS_I(inode);
936 int error;
937 size_t bufsize; 933 size_t bufsize;
938 934
939 /* 935 /*
@@ -950,10 +946,7 @@ xfs_file_readdir(
950 */ 946 */
951 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 947 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
952 948
953 error = xfs_readdir(ip, ctx, bufsize); 949 return xfs_readdir(ip, ctx, bufsize);
954 if (error)
955 return error;
956 return 0;
957} 950}
958 951
959STATIC int 952STATIC int
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index e92730c1d3ca..a2e86e8a0fea 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -20,16 +20,13 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_ag.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_inum.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_bmap.h" 26#include "xfs_bmap.h"
29#include "xfs_bmap_util.h" 27#include "xfs_bmap_util.h"
30#include "xfs_alloc.h" 28#include "xfs_alloc.h"
31#include "xfs_mru_cache.h" 29#include "xfs_mru_cache.h"
32#include "xfs_dinode.h"
33#include "xfs_filestream.h" 30#include "xfs_filestream.h"
34#include "xfs_trace.h" 31#include "xfs_trace.h"
35 32
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c05ac8b70fa9..fdc64220fcb0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -22,7 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_da_format.h" 26#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -40,7 +39,6 @@
40#include "xfs_rtalloc.h" 39#include "xfs_rtalloc.h"
41#include "xfs_trace.h" 40#include "xfs_trace.h"
42#include "xfs_log.h" 41#include "xfs_log.h"
43#include "xfs_dinode.h"
44#include "xfs_filestream.h" 42#include "xfs_filestream.h"
45 43
46/* 44/*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index b45f7b27b5df..9771b7ef62ed 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -20,9 +20,7 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_inum.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_error.h" 26#include "xfs_error.h"
@@ -65,6 +63,7 @@ xfs_inode_alloc(
65 return NULL; 63 return NULL;
66 } 64 }
67 65
66 XFS_STATS_INC(vn_active);
68 ASSERT(atomic_read(&ip->i_pincount) == 0); 67 ASSERT(atomic_read(&ip->i_pincount) == 0);
69 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 68 ASSERT(!spin_is_locked(&ip->i_flags_lock));
70 ASSERT(!xfs_isiflocked(ip)); 69 ASSERT(!xfs_isiflocked(ip));
@@ -130,6 +129,7 @@ xfs_inode_free(
130 /* asserts to verify all state is correct here */ 129 /* asserts to verify all state is correct here */
131 ASSERT(atomic_read(&ip->i_pincount) == 0); 130 ASSERT(atomic_read(&ip->i_pincount) == 0);
132 ASSERT(!xfs_isiflocked(ip)); 131 ASSERT(!xfs_isiflocked(ip));
132 XFS_STATS_DEC(vn_active);
133 133
134 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 134 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
135} 135}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 46748b86b12f..62f1f91c32cb 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -34,6 +34,14 @@ struct xfs_eofblocks {
34#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 34#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
35 35
36/* 36/*
37 * tags for inode radix tree
38 */
39#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
40 in xfs_inode_ag_iterator */
41#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
42#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
43
44/*
37 * Flags for xfs_iget() 45 * Flags for xfs_iget()
38 */ 46 */
39#define XFS_IGET_CREATE 0x1 47#define XFS_IGET_CREATE 0x1
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 7e4549233251..d45ca72af6fb 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -18,11 +18,10 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_shared.h" 20#include "xfs_shared.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_bit.h" 24#include "xfs_bit.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8ed049d1e332..41f804e740d7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,9 +23,7 @@
23#include "xfs_format.h" 23#include "xfs_format.h"
24#include "xfs_log_format.h" 24#include "xfs_log_format.h"
25#include "xfs_trans_resv.h" 25#include "xfs_trans_resv.h"
26#include "xfs_inum.h"
27#include "xfs_sb.h" 26#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_inode.h" 28#include "xfs_inode.h"
31#include "xfs_da_format.h" 29#include "xfs_da_format.h"
@@ -1082,7 +1080,7 @@ xfs_create(
1082 struct xfs_dquot *udqp = NULL; 1080 struct xfs_dquot *udqp = NULL;
1083 struct xfs_dquot *gdqp = NULL; 1081 struct xfs_dquot *gdqp = NULL;
1084 struct xfs_dquot *pdqp = NULL; 1082 struct xfs_dquot *pdqp = NULL;
1085 struct xfs_trans_res tres; 1083 struct xfs_trans_res *tres;
1086 uint resblks; 1084 uint resblks;
1087 1085
1088 trace_xfs_create(dp, name); 1086 trace_xfs_create(dp, name);
@@ -1105,13 +1103,11 @@ xfs_create(
1105 if (is_dir) { 1103 if (is_dir) {
1106 rdev = 0; 1104 rdev = 0;
1107 resblks = XFS_MKDIR_SPACE_RES(mp, name->len); 1105 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1108 tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; 1106 tres = &M_RES(mp)->tr_mkdir;
1109 tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
1110 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); 1107 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
1111 } else { 1108 } else {
1112 resblks = XFS_CREATE_SPACE_RES(mp, name->len); 1109 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1113 tres.tr_logres = M_RES(mp)->tr_create.tr_logres; 1110 tres = &M_RES(mp)->tr_create;
1114 tres.tr_logcount = XFS_CREATE_LOG_COUNT;
1115 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); 1111 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1116 } 1112 }
1117 1113
@@ -1123,17 +1119,16 @@ xfs_create(
1123 * the case we'll drop the one we have and get a more 1119 * the case we'll drop the one we have and get a more
1124 * appropriate transaction later. 1120 * appropriate transaction later.
1125 */ 1121 */
1126 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; 1122 error = xfs_trans_reserve(tp, tres, resblks, 0);
1127 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1128 if (error == -ENOSPC) { 1123 if (error == -ENOSPC) {
1129 /* flush outstanding delalloc blocks and retry */ 1124 /* flush outstanding delalloc blocks and retry */
1130 xfs_flush_inodes(mp); 1125 xfs_flush_inodes(mp);
1131 error = xfs_trans_reserve(tp, &tres, resblks, 0); 1126 error = xfs_trans_reserve(tp, tres, resblks, 0);
1132 } 1127 }
1133 if (error == -ENOSPC) { 1128 if (error == -ENOSPC) {
1134 /* No space at all so try a "no-allocation" reservation */ 1129 /* No space at all so try a "no-allocation" reservation */
1135 resblks = 0; 1130 resblks = 0;
1136 error = xfs_trans_reserve(tp, &tres, 0, 0); 1131 error = xfs_trans_reserve(tp, tres, 0, 0);
1137 } 1132 }
1138 if (error) { 1133 if (error) {
1139 cancel_flags = 0; 1134 cancel_flags = 0;
@@ -2488,9 +2483,7 @@ xfs_remove(
2488 xfs_fsblock_t first_block; 2483 xfs_fsblock_t first_block;
2489 int cancel_flags; 2484 int cancel_flags;
2490 int committed; 2485 int committed;
2491 int link_zero;
2492 uint resblks; 2486 uint resblks;
2493 uint log_count;
2494 2487
2495 trace_xfs_remove(dp, name); 2488 trace_xfs_remove(dp, name);
2496 2489
@@ -2505,13 +2498,10 @@ xfs_remove(
2505 if (error) 2498 if (error)
2506 goto std_return; 2499 goto std_return;
2507 2500
2508 if (is_dir) { 2501 if (is_dir)
2509 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); 2502 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2510 log_count = XFS_DEFAULT_LOG_COUNT; 2503 else
2511 } else {
2512 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); 2504 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2513 log_count = XFS_REMOVE_LOG_COUNT;
2514 }
2515 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2505 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2516 2506
2517 /* 2507 /*
@@ -2579,9 +2569,6 @@ xfs_remove(
2579 if (error) 2569 if (error)
2580 goto out_trans_cancel; 2570 goto out_trans_cancel;
2581 2571
2582 /* Determine if this is the last link while the inode is locked */
2583 link_zero = (ip->i_d.di_nlink == 0);
2584
2585 xfs_bmap_init(&free_list, &first_block); 2572 xfs_bmap_init(&free_list, &first_block);
2586 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 2573 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2587 &first_block, &free_list, resblks); 2574 &first_block, &free_list, resblks);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9af2882e1f4c..4ed2ba9342dc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,7 +20,6 @@
20 20
21#include "xfs_inode_buf.h" 21#include "xfs_inode_buf.h"
22#include "xfs_inode_fork.h" 22#include "xfs_inode_fork.h"
23#include "xfs_dinode.h"
24 23
25/* 24/*
26 * Kernel only inode definitions 25 * Kernel only inode definitions
@@ -324,7 +323,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
324 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 323 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
325 ((pip)->i_d.di_mode & S_ISGID)) 324 ((pip)->i_d.di_mode & S_ISGID))
326 325
327
328int xfs_release(struct xfs_inode *ip); 326int xfs_release(struct xfs_inode *ip);
329void xfs_inactive(struct xfs_inode *ip); 327void xfs_inactive(struct xfs_inode *ip);
330int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 328int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 63de0b0acc32..bf13a5a7e2f4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_inode.h" 24#include "xfs_inode.h"
27#include "xfs_trans.h" 25#include "xfs_trans.h"
@@ -29,7 +27,6 @@
29#include "xfs_error.h" 27#include "xfs_error.h"
30#include "xfs_trace.h" 28#include "xfs_trace.h"
31#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
32#include "xfs_dinode.h"
33#include "xfs_log.h" 30#include "xfs_log.h"
34 31
35 32
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 24c926b6fe85..a1831980a68e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_ioctl.h" 26#include "xfs_ioctl.h"
@@ -40,7 +38,6 @@
40#include "xfs_trace.h" 38#include "xfs_trace.h"
41#include "xfs_icache.h" 39#include "xfs_icache.h"
42#include "xfs_symlink.h" 40#include "xfs_symlink.h"
43#include "xfs_dinode.h"
44#include "xfs_trans.h" 41#include "xfs_trans.h"
45 42
46#include <linux/capability.h> 43#include <linux/capability.h>
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 94ce027e28e3..ec6772866f3d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -25,8 +25,6 @@
25#include "xfs_format.h" 25#include "xfs_format.h"
26#include "xfs_log_format.h" 26#include "xfs_log_format.h"
27#include "xfs_trans_resv.h" 27#include "xfs_trans_resv.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_inode.h" 29#include "xfs_inode.h"
32#include "xfs_itable.h" 30#include "xfs_itable.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index afcf3c926565..c980e2a5086b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_btree.h" 26#include "xfs_btree.h"
@@ -38,7 +36,6 @@
38#include "xfs_quota.h" 36#include "xfs_quota.h"
39#include "xfs_dquot_item.h" 37#include "xfs_dquot_item.h"
40#include "xfs_dquot.h" 38#include "xfs_dquot.h"
41#include "xfs_dinode.h"
42 39
43 40
44#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 41#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -52,7 +49,6 @@ xfs_iomap_eof_align_last_fsb(
52 xfs_extlen_t extsize, 49 xfs_extlen_t extsize,
53 xfs_fileoff_t *last_fsb) 50 xfs_fileoff_t *last_fsb)
54{ 51{
55 xfs_fileoff_t new_last_fsb = 0;
56 xfs_extlen_t align = 0; 52 xfs_extlen_t align = 0;
57 int eof, error; 53 int eof, error;
58 54
@@ -70,8 +66,8 @@ xfs_iomap_eof_align_last_fsb(
70 else if (mp->m_dalign) 66 else if (mp->m_dalign)
71 align = mp->m_dalign; 67 align = mp->m_dalign;
72 68
73 if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) 69 if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
74 new_last_fsb = roundup_64(*last_fsb, align); 70 align = 0;
75 } 71 }
76 72
77 /* 73 /*
@@ -79,14 +75,14 @@ xfs_iomap_eof_align_last_fsb(
79 * (when file on a real-time subvolume or has di_extsize hint). 75 * (when file on a real-time subvolume or has di_extsize hint).
80 */ 76 */
81 if (extsize) { 77 if (extsize) {
82 if (new_last_fsb) 78 if (align)
83 align = roundup_64(new_last_fsb, extsize); 79 align = roundup_64(align, extsize);
84 else 80 else
85 align = extsize; 81 align = extsize;
86 new_last_fsb = roundup_64(*last_fsb, align);
87 } 82 }
88 83
89 if (new_last_fsb) { 84 if (align) {
85 xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
90 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); 86 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
91 if (error) 87 if (error)
92 return error; 88 return error;
@@ -264,7 +260,6 @@ xfs_iomap_eof_want_preallocate(
264{ 260{
265 xfs_fileoff_t start_fsb; 261 xfs_fileoff_t start_fsb;
266 xfs_filblks_t count_fsb; 262 xfs_filblks_t count_fsb;
267 xfs_fsblock_t firstblock;
268 int n, error, imaps; 263 int n, error, imaps;
269 int found_delalloc = 0; 264 int found_delalloc = 0;
270 265
@@ -289,7 +284,6 @@ xfs_iomap_eof_want_preallocate(
289 count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 284 count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
290 while (count_fsb > 0) { 285 while (count_fsb > 0) {
291 imaps = nimaps; 286 imaps = nimaps;
292 firstblock = NULLFSBLOCK;
293 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, 287 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
294 0); 288 0);
295 if (error) 289 if (error)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ec6dcdc181ee..c50311cae1b1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
@@ -37,8 +35,7 @@
37#include "xfs_icache.h" 35#include "xfs_icache.h"
38#include "xfs_symlink.h" 36#include "xfs_symlink.h"
39#include "xfs_da_btree.h" 37#include "xfs_da_btree.h"
40#include "xfs_dir2_priv.h" 38#include "xfs_dir2.h"
41#include "xfs_dinode.h"
42#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
43 40
44#include <linux/capability.h> 41#include <linux/capability.h>
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 894924a5129b..82e314258f73 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -21,9 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_inum.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 24#include "xfs_mount.h"
28#include "xfs_inode.h" 25#include "xfs_inode.h"
29#include "xfs_btree.h" 26#include "xfs_btree.h"
@@ -33,7 +30,6 @@
33#include "xfs_error.h" 30#include "xfs_error.h"
34#include "xfs_trace.h" 31#include "xfs_trace.h"
35#include "xfs_icache.h" 32#include "xfs_icache.h"
36#include "xfs_dinode.h"
37 33
38STATIC int 34STATIC int
39xfs_internal_inum( 35xfs_internal_inum(
@@ -352,7 +348,6 @@ xfs_bulkstat(
352 int *done) /* 1 if there are more stats to get */ 348 int *done) /* 1 if there are more stats to get */
353{ 349{
354 xfs_buf_t *agbp; /* agi header buffer */ 350 xfs_buf_t *agbp; /* agi header buffer */
355 xfs_agi_t *agi; /* agi header data */
356 xfs_agino_t agino; /* inode # in allocation group */ 351 xfs_agino_t agino; /* inode # in allocation group */
357 xfs_agnumber_t agno; /* allocation group number */ 352 xfs_agnumber_t agno; /* allocation group number */
358 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 353 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -403,7 +398,6 @@ xfs_bulkstat(
403 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 398 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
404 if (error) 399 if (error)
405 break; 400 break;
406 agi = XFS_BUF_TO_AGI(agbp);
407 /* 401 /*
408 * Allocate and initialize a btree cursor for ialloc btree. 402 * Allocate and initialize a btree cursor for ialloc btree.
409 */ 403 */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 6a51619d8690..c31d2c2eadc4 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -384,4 +384,10 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
384#endif /* XFS_WARN */ 384#endif /* XFS_WARN */
385#endif /* DEBUG */ 385#endif /* DEBUG */
386 386
387#ifdef CONFIG_XFS_RT
388#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
389#else
390#define XFS_IS_REALTIME_INODE(ip) (0)
391#endif
392
387#endif /* __XFS_LINUX__ */ 393#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fe88ef67f93a..e408bf5a3ff7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_error.h" 25#include "xfs_error.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
@@ -1031,7 +1029,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
1031 struct xlog *log = mp->m_log; 1029 struct xlog *log = mp->m_log;
1032 int needed = 0; 1030 int needed = 0;
1033 1031
1034 if (!xfs_fs_writable(mp)) 1032 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
1035 return 0; 1033 return 0;
1036 1034
1037 if (!xlog_cil_empty(log)) 1035 if (!xlog_cil_empty(log))
@@ -1808,6 +1806,8 @@ xlog_sync(
1808 XFS_BUF_ZEROFLAGS(bp); 1806 XFS_BUF_ZEROFLAGS(bp);
1809 XFS_BUF_ASYNC(bp); 1807 XFS_BUF_ASYNC(bp);
1810 bp->b_flags |= XBF_SYNCIO; 1808 bp->b_flags |= XBF_SYNCIO;
1809 /* use high priority completion wq */
1810 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1811 1811
1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { 1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1813 bp->b_flags |= XBF_FUA; 1813 bp->b_flags |= XBF_FUA;
@@ -1856,6 +1856,8 @@ xlog_sync(
1856 bp->b_flags |= XBF_SYNCIO; 1856 bp->b_flags |= XBF_SYNCIO;
1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1858 bp->b_flags |= XBF_FUA; 1858 bp->b_flags |= XBF_FUA;
1859 /* use high priority completion wq */
1860 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1859 1861
1860 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1862 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1861 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1863 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f506c457011e..45cc0ce18adf 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,11 +17,10 @@
17 17
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_shared.h" 22#include "xfs_shared.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_error.h" 25#include "xfs_error.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 00cd7f3a8f59..a5a945fc3bdc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -22,11 +22,10 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
30#include "xfs_inode.h" 29#include "xfs_inode.h"
31#include "xfs_trans.h" 30#include "xfs_trans.h"
32#include "xfs_log.h" 31#include "xfs_log.h"
@@ -42,7 +41,6 @@
42#include "xfs_trace.h" 41#include "xfs_trace.h"
43#include "xfs_icache.h" 42#include "xfs_icache.h"
44#include "xfs_bmap_btree.h" 43#include "xfs_bmap_btree.h"
45#include "xfs_dinode.h"
46#include "xfs_error.h" 44#include "xfs_error.h"
47#include "xfs_dir2.h" 45#include "xfs_dir2.h"
48 46
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 63ca2f0420b1..d8b67547ab34 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,10 +17,9 @@
17 17
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 23#include "xfs_mount.h"
25 24
26/* 25/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 51435dbce9c4..d3d38836f87f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -22,11 +22,10 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
30#include "xfs_inode.h" 29#include "xfs_inode.h"
31#include "xfs_dir2.h" 30#include "xfs_dir2.h"
32#include "xfs_ialloc.h" 31#include "xfs_ialloc.h"
@@ -41,7 +40,6 @@
41#include "xfs_fsops.h" 40#include "xfs_fsops.h"
42#include "xfs_trace.h" 41#include "xfs_trace.h"
43#include "xfs_icache.h" 42#include "xfs_icache.h"
44#include "xfs_dinode.h"
45#include "xfs_sysfs.h" 43#include "xfs_sysfs.h"
46 44
47 45
@@ -1074,11 +1072,23 @@ xfs_unmountfs(
1074 xfs_sysfs_del(&mp->m_kobj); 1072 xfs_sysfs_del(&mp->m_kobj);
1075} 1073}
1076 1074
1077int 1075/*
1078xfs_fs_writable(xfs_mount_t *mp) 1076 * Determine whether modifications can proceed. The caller specifies the minimum
1077 * freeze level for which modifications should not be allowed. This allows
1078 * certain operations to proceed while the freeze sequence is in progress, if
1079 * necessary.
1080 */
1081bool
1082xfs_fs_writable(
1083 struct xfs_mount *mp,
1084 int level)
1079{ 1085{
1080 return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || 1086 ASSERT(level > SB_UNFROZEN);
1081 (mp->m_flags & XFS_MOUNT_RDONLY)); 1087 if ((mp->m_super->s_writers.frozen >= level) ||
1088 XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
1089 return false;
1090
1091 return true;
1082} 1092}
1083 1093
1084/* 1094/*
@@ -1086,9 +1096,9 @@ xfs_fs_writable(xfs_mount_t *mp)
1086 * 1096 *
1087 * Sync the superblock counters to disk. 1097 * Sync the superblock counters to disk.
1088 * 1098 *
1089 * Note this code can be called during the process of freezing, so 1099 * Note this code can be called during the process of freezing, so we use the
1090 * we may need to use the transaction allocator which does not 1100 * transaction allocator that does not block when the transaction subsystem is
1091 * block when the transaction subsystem is in its frozen state. 1101 * in its frozen state.
1092 */ 1102 */
1093int 1103int
1094xfs_log_sbcount(xfs_mount_t *mp) 1104xfs_log_sbcount(xfs_mount_t *mp)
@@ -1096,7 +1106,8 @@ xfs_log_sbcount(xfs_mount_t *mp)
1096 xfs_trans_t *tp; 1106 xfs_trans_t *tp;
1097 int error; 1107 int error;
1098 1108
1099 if (!xfs_fs_writable(mp)) 1109 /* allow this to proceed during the freeze sequence... */
1110 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1100 return 0; 1111 return 0;
1101 1112
1102 xfs_icsb_sync_counters(mp, 0); 1113 xfs_icsb_sync_counters(mp, 0);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b0447c86e7e2..22ccf69d4d3c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -168,6 +168,7 @@ typedef struct xfs_mount {
168 /* low free space thresholds */ 168 /* low free space thresholds */
169 struct xfs_kobj m_kobj; 169 struct xfs_kobj m_kobj;
170 170
171 struct workqueue_struct *m_buf_workqueue;
171 struct workqueue_struct *m_data_workqueue; 172 struct workqueue_struct *m_data_workqueue;
172 struct workqueue_struct *m_unwritten_workqueue; 173 struct workqueue_struct *m_unwritten_workqueue;
173 struct workqueue_struct *m_cil_workqueue; 174 struct workqueue_struct *m_cil_workqueue;
@@ -320,10 +321,7 @@ typedef struct xfs_mod_sb {
320 321
321/* 322/*
322 * Per-ag incore structure, copies of information in agf and agi, to improve the 323 * Per-ag incore structure, copies of information in agf and agi, to improve the
323 * performance of allocation group selection. This is defined for the kernel 324 * performance of allocation group selection.
324 * only, and hence is defined here instead of in xfs_ag.h. You need the struct
325 * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
326 * so this doesn't introduce any strange header file dependencies.
327 */ 325 */
328typedef struct xfs_perag { 326typedef struct xfs_perag {
329 struct xfs_mount *pag_mount; /* owner filesystem */ 327 struct xfs_mount *pag_mount; /* owner filesystem */
@@ -384,7 +382,7 @@ extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
384extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 382extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
385extern int xfs_readsb(xfs_mount_t *, int); 383extern int xfs_readsb(xfs_mount_t *, int);
386extern void xfs_freesb(xfs_mount_t *); 384extern void xfs_freesb(xfs_mount_t *);
387extern int xfs_fs_writable(xfs_mount_t *); 385extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
388extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 386extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
389 387
390extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 388extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d68f23021af3..79fb19dd9c83 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -23,7 +23,6 @@
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
29#include "xfs_ialloc.h" 28#include "xfs_ialloc.h"
@@ -38,7 +37,6 @@
38#include "xfs_trace.h" 37#include "xfs_trace.h"
39#include "xfs_icache.h" 38#include "xfs_icache.h"
40#include "xfs_cksum.h" 39#include "xfs_cksum.h"
41#include "xfs_dinode.h"
42 40
43/* 41/*
44 * The global quota manager. There is only one of these for the entire 42 * The global quota manager. There is only one of these for the entire
@@ -1749,23 +1747,21 @@ xfs_qm_vop_dqalloc(
1749 xfs_iunlock(ip, lockflags); 1747 xfs_iunlock(ip, lockflags);
1750 if (O_udqpp) 1748 if (O_udqpp)
1751 *O_udqpp = uq; 1749 *O_udqpp = uq;
1752 else if (uq) 1750 else
1753 xfs_qm_dqrele(uq); 1751 xfs_qm_dqrele(uq);
1754 if (O_gdqpp) 1752 if (O_gdqpp)
1755 *O_gdqpp = gq; 1753 *O_gdqpp = gq;
1756 else if (gq) 1754 else
1757 xfs_qm_dqrele(gq); 1755 xfs_qm_dqrele(gq);
1758 if (O_pdqpp) 1756 if (O_pdqpp)
1759 *O_pdqpp = pq; 1757 *O_pdqpp = pq;
1760 else if (pq) 1758 else
1761 xfs_qm_dqrele(pq); 1759 xfs_qm_dqrele(pq);
1762 return 0; 1760 return 0;
1763 1761
1764error_rele: 1762error_rele:
1765 if (gq) 1763 xfs_qm_dqrele(gq);
1766 xfs_qm_dqrele(gq); 1764 xfs_qm_dqrele(uq);
1767 if (uq)
1768 xfs_qm_dqrele(uq);
1769 return error; 1765 return error;
1770} 1766}
1771 1767
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2c61e61b0205..3e52d5de7ae1 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_quota.h" 23#include "xfs_quota.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 80f2d77d929a..74fca68e43b6 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -26,7 +26,6 @@
26#include "xfs_trans_resv.h" 26#include "xfs_trans_resv.h"
27#include "xfs_bit.h" 27#include "xfs_bit.h"
28#include "xfs_sb.h" 28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_mount.h" 29#include "xfs_mount.h"
31#include "xfs_inode.h" 30#include "xfs_inode.h"
32#include "xfs_trans.h" 31#include "xfs_trans.h"
@@ -784,19 +783,21 @@ xfs_qm_log_quotaoff(
784{ 783{
785 xfs_trans_t *tp; 784 xfs_trans_t *tp;
786 int error; 785 int error;
787 xfs_qoff_logitem_t *qoffi=NULL; 786 xfs_qoff_logitem_t *qoffi;
788 uint oldsbqflag=0; 787
788 *qoffstartp = NULL;
789 789
790 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); 790 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
791 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); 791 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
792 if (error) 792 if (error) {
793 goto error0; 793 xfs_trans_cancel(tp, 0);
794 goto out;
795 }
794 796
795 qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); 797 qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
796 xfs_trans_log_quotaoff_item(tp, qoffi); 798 xfs_trans_log_quotaoff_item(tp, qoffi);
797 799
798 spin_lock(&mp->m_sb_lock); 800 spin_lock(&mp->m_sb_lock);
799 oldsbqflag = mp->m_sb.sb_qflags;
800 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; 801 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
801 spin_unlock(&mp->m_sb_lock); 802 spin_unlock(&mp->m_sb_lock);
802 803
@@ -809,19 +810,11 @@ xfs_qm_log_quotaoff(
809 */ 810 */
810 xfs_trans_set_sync(tp); 811 xfs_trans_set_sync(tp);
811 error = xfs_trans_commit(tp, 0); 812 error = xfs_trans_commit(tp, 0);
813 if (error)
814 goto out;
812 815
813error0:
814 if (error) {
815 xfs_trans_cancel(tp, 0);
816 /*
817 * No one else is modifying sb_qflags, so this is OK.
818 * We still hold the quotaofflock.
819 */
820 spin_lock(&mp->m_sb_lock);
821 mp->m_sb.sb_qflags = oldsbqflag;
822 spin_unlock(&mp->m_sb_lock);
823 }
824 *qoffstartp = qoffi; 816 *qoffstartp = qoffi;
817out:
825 return error; 818 return error;
826} 819}
827 820
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index b238027df987..7542bbeca6a1 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -19,8 +19,6 @@
19#include "xfs_format.h" 19#include "xfs_format.h"
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 22#include "xfs_mount.h"
25#include "xfs_inode.h" 23#include "xfs_inode.h"
26#include "xfs_quota.h" 24#include "xfs_quota.h"
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e1175ea9b551..f2079b6911cc 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_bmap.h" 27#include "xfs_bmap.h"
@@ -36,7 +34,6 @@
36#include "xfs_trace.h" 34#include "xfs_trace.h"
37#include "xfs_buf.h" 35#include "xfs_buf.h"
38#include "xfs_icache.h" 36#include "xfs_icache.h"
39#include "xfs_dinode.h"
40#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
41 38
42 39
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 206b97fd1d8a..19cbda196369 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -21,9 +21,7 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_inum.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_inode.h" 27#include "xfs_inode.h"
@@ -44,7 +42,6 @@
44#include "xfs_icache.h" 42#include "xfs_icache.h"
45#include "xfs_trace.h" 43#include "xfs_trace.h"
46#include "xfs_icreate_item.h" 44#include "xfs_icreate_item.h"
47#include "xfs_dinode.h"
48#include "xfs_filestream.h" 45#include "xfs_filestream.h"
49#include "xfs_quota.h" 46#include "xfs_quota.h"
50#include "xfs_sysfs.h" 47#include "xfs_sysfs.h"
@@ -796,8 +793,7 @@ xfs_open_devices(
796 out_free_ddev_targ: 793 out_free_ddev_targ:
797 xfs_free_buftarg(mp, mp->m_ddev_targp); 794 xfs_free_buftarg(mp, mp->m_ddev_targp);
798 out_close_rtdev: 795 out_close_rtdev:
799 if (rtdev) 796 xfs_blkdev_put(rtdev);
800 xfs_blkdev_put(rtdev);
801 out_close_logdev: 797 out_close_logdev:
802 if (logdev && logdev != ddev) 798 if (logdev && logdev != ddev)
803 xfs_blkdev_put(logdev); 799 xfs_blkdev_put(logdev);
@@ -842,10 +838,15 @@ STATIC int
842xfs_init_mount_workqueues( 838xfs_init_mount_workqueues(
843 struct xfs_mount *mp) 839 struct xfs_mount *mp)
844{ 840{
841 mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
842 WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname);
843 if (!mp->m_buf_workqueue)
844 goto out;
845
845 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", 846 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
846 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 847 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
847 if (!mp->m_data_workqueue) 848 if (!mp->m_data_workqueue)
848 goto out; 849 goto out_destroy_buf;
849 850
850 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", 851 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
851 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 852 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
@@ -863,7 +864,7 @@ xfs_init_mount_workqueues(
863 goto out_destroy_cil; 864 goto out_destroy_cil;
864 865
865 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", 866 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
866 WQ_FREEZABLE, 0, mp->m_fsname); 867 WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
867 if (!mp->m_log_workqueue) 868 if (!mp->m_log_workqueue)
868 goto out_destroy_reclaim; 869 goto out_destroy_reclaim;
869 870
@@ -884,6 +885,8 @@ out_destroy_unwritten:
884 destroy_workqueue(mp->m_unwritten_workqueue); 885 destroy_workqueue(mp->m_unwritten_workqueue);
885out_destroy_data_iodone_queue: 886out_destroy_data_iodone_queue:
886 destroy_workqueue(mp->m_data_workqueue); 887 destroy_workqueue(mp->m_data_workqueue);
888out_destroy_buf:
889 destroy_workqueue(mp->m_buf_workqueue);
887out: 890out:
888 return -ENOMEM; 891 return -ENOMEM;
889} 892}
@@ -898,6 +901,7 @@ xfs_destroy_mount_workqueues(
898 destroy_workqueue(mp->m_cil_workqueue); 901 destroy_workqueue(mp->m_cil_workqueue);
899 destroy_workqueue(mp->m_data_workqueue); 902 destroy_workqueue(mp->m_data_workqueue);
900 destroy_workqueue(mp->m_unwritten_workqueue); 903 destroy_workqueue(mp->m_unwritten_workqueue);
904 destroy_workqueue(mp->m_buf_workqueue);
901} 905}
902 906
903/* 907/*
@@ -1000,7 +1004,6 @@ xfs_fs_evict_inode(
1000 clear_inode(inode); 1004 clear_inode(inode);
1001 XFS_STATS_INC(vn_rele); 1005 XFS_STATS_INC(vn_rele);
1002 XFS_STATS_INC(vn_remove); 1006 XFS_STATS_INC(vn_remove);
1003 XFS_STATS_DEC(vn_active);
1004 1007
1005 xfs_inactive(ip); 1008 xfs_inactive(ip);
1006} 1009}
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 02ae62a998e0..25791df6f638 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -42,7 +40,6 @@
42#include "xfs_symlink.h" 40#include "xfs_symlink.h"
43#include "xfs_trans.h" 41#include "xfs_trans.h"
44#include "xfs_log.h" 42#include "xfs_log.h"
45#include "xfs_dinode.h"
46 43
47/* ----- Kernel only functions below ----- */ 44/* ----- Kernel only functions below ----- */
48STATIC int 45STATIC int
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 1e85bcd0e418..13a029806805 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 30e8e3410955..fa3135b9bf04 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_extent_busy.h" 27#include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 859482f53b5a..573aefb5a573 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -18,10 +18,9 @@
18 */ 18 */
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_trans_priv.h" 26#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index e2b2216b1635..0a4d4ab6d9a9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
@@ -229,13 +227,6 @@ xfs_trans_getsb(xfs_trans_t *tp,
229 return bp; 227 return bp;
230} 228}
231 229
232#ifdef DEBUG
233xfs_buftarg_t *xfs_error_target;
234int xfs_do_error;
235int xfs_req_num;
236int xfs_error_mod = 33;
237#endif
238
239/* 230/*
240 * Get and lock the buffer for the caller if it is not already 231 * Get and lock the buffer for the caller if it is not already
241 * locked within the given transaction. If it has not yet been 232 * locked within the given transaction. If it has not yet been
@@ -257,46 +248,11 @@ xfs_trans_read_buf_map(
257 struct xfs_buf **bpp, 248 struct xfs_buf **bpp,
258 const struct xfs_buf_ops *ops) 249 const struct xfs_buf_ops *ops)
259{ 250{
260 xfs_buf_t *bp; 251 struct xfs_buf *bp = NULL;
261 xfs_buf_log_item_t *bip; 252 struct xfs_buf_log_item *bip;
262 int error; 253 int error;
263 254
264 *bpp = NULL; 255 *bpp = NULL;
265 if (!tp) {
266 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
267 if (!bp)
268 return (flags & XBF_TRYLOCK) ?
269 -EAGAIN : -ENOMEM;
270
271 if (bp->b_error) {
272 error = bp->b_error;
273 xfs_buf_ioerror_alert(bp, __func__);
274 XFS_BUF_UNDONE(bp);
275 xfs_buf_stale(bp);
276 xfs_buf_relse(bp);
277
278 /* bad CRC means corrupted metadata */
279 if (error == -EFSBADCRC)
280 error = -EFSCORRUPTED;
281 return error;
282 }
283#ifdef DEBUG
284 if (xfs_do_error) {
285 if (xfs_error_target == target) {
286 if (((xfs_req_num++) % xfs_error_mod) == 0) {
287 xfs_buf_relse(bp);
288 xfs_debug(mp, "Returning error!");
289 return -EIO;
290 }
291 }
292 }
293#endif
294 if (XFS_FORCED_SHUTDOWN(mp))
295 goto shutdown_abort;
296 *bpp = bp;
297 return 0;
298 }
299
300 /* 256 /*
301 * If we find the buffer in the cache with this transaction 257 * If we find the buffer in the cache with this transaction
302 * pointer in its b_fsprivate2 field, then we know we already 258 * pointer in its b_fsprivate2 field, then we know we already
@@ -305,49 +261,24 @@ xfs_trans_read_buf_map(
305 * If the buffer is not yet read in, then we read it in, increment 261 * If the buffer is not yet read in, then we read it in, increment
306 * the lock recursion count, and return it to the caller. 262 * the lock recursion count, and return it to the caller.
307 */ 263 */
308 bp = xfs_trans_buf_item_match(tp, target, map, nmaps); 264 if (tp)
309 if (bp != NULL) { 265 bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
266 if (bp) {
310 ASSERT(xfs_buf_islocked(bp)); 267 ASSERT(xfs_buf_islocked(bp));
311 ASSERT(bp->b_transp == tp); 268 ASSERT(bp->b_transp == tp);
312 ASSERT(bp->b_fspriv != NULL); 269 ASSERT(bp->b_fspriv != NULL);
313 ASSERT(!bp->b_error); 270 ASSERT(!bp->b_error);
314 if (!(XFS_BUF_ISDONE(bp))) { 271 ASSERT(bp->b_flags & XBF_DONE);
315 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 272
316 ASSERT(!XFS_BUF_ISASYNC(bp));
317 ASSERT(bp->b_iodone == NULL);
318 XFS_BUF_READ(bp);
319 bp->b_ops = ops;
320
321 error = xfs_buf_submit_wait(bp);
322 if (error) {
323 if (!XFS_FORCED_SHUTDOWN(mp))
324 xfs_buf_ioerror_alert(bp, __func__);
325 xfs_buf_relse(bp);
326 /*
327 * We can gracefully recover from most read
328 * errors. Ones we can't are those that happen
329 * after the transaction's already dirty.
330 */
331 if (tp->t_flags & XFS_TRANS_DIRTY)
332 xfs_force_shutdown(tp->t_mountp,
333 SHUTDOWN_META_IO_ERROR);
334 /* bad CRC means corrupted metadata */
335 if (error == -EFSBADCRC)
336 error = -EFSCORRUPTED;
337 return error;
338 }
339 }
340 /* 273 /*
341 * We never locked this buf ourselves, so we shouldn't 274 * We never locked this buf ourselves, so we shouldn't
342 * brelse it either. Just get out. 275 * brelse it either. Just get out.
343 */ 276 */
344 if (XFS_FORCED_SHUTDOWN(mp)) { 277 if (XFS_FORCED_SHUTDOWN(mp)) {
345 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 278 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
346 *bpp = NULL;
347 return -EIO; 279 return -EIO;
348 } 280 }
349 281
350
351 bip = bp->b_fspriv; 282 bip = bp->b_fspriv;
352 bip->bli_recur++; 283 bip->bli_recur++;
353 284
@@ -358,17 +289,29 @@ xfs_trans_read_buf_map(
358 } 289 }
359 290
360 bp = xfs_buf_read_map(target, map, nmaps, flags, ops); 291 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
361 if (bp == NULL) { 292 if (!bp) {
362 *bpp = NULL; 293 if (!(flags & XBF_TRYLOCK))
363 return (flags & XBF_TRYLOCK) ? 294 return -ENOMEM;
364 0 : -ENOMEM; 295 return tp ? 0 : -EAGAIN;
365 } 296 }
297
298 /*
299 * If we've had a read error, then the contents of the buffer are
300 * invalid and should not be used. To ensure that a followup read tries
301 * to pull the buffer from disk again, we clear the XBF_DONE flag and
302 * mark the buffer stale. This ensures that anyone who has a current
303 * reference to the buffer will interpret it's contents correctly and
304 * future cache lookups will also treat it as an empty, uninitialised
305 * buffer.
306 */
366 if (bp->b_error) { 307 if (bp->b_error) {
367 error = bp->b_error; 308 error = bp->b_error;
309 if (!XFS_FORCED_SHUTDOWN(mp))
310 xfs_buf_ioerror_alert(bp, __func__);
311 bp->b_flags &= ~XBF_DONE;
368 xfs_buf_stale(bp); 312 xfs_buf_stale(bp);
369 XFS_BUF_DONE(bp); 313
370 xfs_buf_ioerror_alert(bp, __func__); 314 if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
371 if (tp->t_flags & XFS_TRANS_DIRTY)
372 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 315 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
373 xfs_buf_relse(bp); 316 xfs_buf_relse(bp);
374 317
@@ -377,33 +320,19 @@ xfs_trans_read_buf_map(
377 error = -EFSCORRUPTED; 320 error = -EFSCORRUPTED;
378 return error; 321 return error;
379 } 322 }
380#ifdef DEBUG 323
381 if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { 324 if (XFS_FORCED_SHUTDOWN(mp)) {
382 if (xfs_error_target == target) { 325 xfs_buf_relse(bp);
383 if (((xfs_req_num++) % xfs_error_mod) == 0) { 326 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
384 xfs_force_shutdown(tp->t_mountp, 327 return -EIO;
385 SHUTDOWN_META_IO_ERROR);
386 xfs_buf_relse(bp);
387 xfs_debug(mp, "Returning trans error!");
388 return -EIO;
389 }
390 }
391 } 328 }
392#endif
393 if (XFS_FORCED_SHUTDOWN(mp))
394 goto shutdown_abort;
395 329
396 _xfs_trans_bjoin(tp, bp, 1); 330 if (tp)
331 _xfs_trans_bjoin(tp, bp, 1);
397 trace_xfs_trans_read_buf(bp->b_fspriv); 332 trace_xfs_trans_read_buf(bp->b_fspriv);
398
399 *bpp = bp; 333 *bpp = bp;
400 return 0; 334 return 0;
401 335
402shutdown_abort:
403 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
404 xfs_buf_relse(bp);
405 *bpp = NULL;
406 return -EIO;
407} 336}
408 337
409/* 338/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 846e061c2e98..76a16df55ef7 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_error.h" 26#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 47978ba89dae..284397dd7990 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,10 +18,9 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_shared.h" 20#include "xfs_shared.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_trans_priv.h" 26#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdb4d86520e1..17280cd71934 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 93455b998041..69f6e475de97 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_da_format.h" 24#include "xfs_da_format.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"