Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile            |    2
-rw-r--r--  fs/btrfs/acl.c               |    2
-rw-r--r--  fs/btrfs/backref.c           |   16
-rw-r--r--  fs/btrfs/btrfs_inode.h       |    4
-rw-r--r--  fs/btrfs/check-integrity.c   |   31
-rw-r--r--  fs/btrfs/compression.c       |    6
-rw-r--r--  fs/btrfs/ctree.c             |  229
-rw-r--r--  fs/btrfs/ctree.h             |  184
-rw-r--r--  fs/btrfs/delayed-inode.c     |   11
-rw-r--r--  fs/btrfs/dev-replace.c       |  856
-rw-r--r--  fs/btrfs/dev-replace.h       |   44
-rw-r--r--  fs/btrfs/dir-item.c          |   59
-rw-r--r--  fs/btrfs/disk-io.c           |  146
-rw-r--r--  fs/btrfs/disk-io.h           |    4
-rw-r--r--  fs/btrfs/extent-tree.c       |  245
-rw-r--r--  fs/btrfs/extent_io.c         |   37
-rw-r--r--  fs/btrfs/extent_io.h         |    4
-rw-r--r--  fs/btrfs/extent_map.c        |   41
-rw-r--r--  fs/btrfs/extent_map.h        |    3
-rw-r--r--  fs/btrfs/file-item.c         |   25
-rw-r--r--  fs/btrfs/file.c              |  450
-rw-r--r--  fs/btrfs/free-space-cache.c  |   71
-rw-r--r--  fs/btrfs/inode-map.c         |    5
-rw-r--r--  fs/btrfs/inode.c             |  617
-rw-r--r--  fs/btrfs/ioctl.c             |  411
-rw-r--r--  fs/btrfs/ioctl.h             |   48
-rw-r--r--  fs/btrfs/math.h              |   44
-rw-r--r--  fs/btrfs/ordered-data.c      |  103
-rw-r--r--  fs/btrfs/ordered-data.h      |    9
-rw-r--r--  fs/btrfs/print-tree.c        |    3
-rw-r--r--  fs/btrfs/qgroup.c            |   20
-rw-r--r--  fs/btrfs/reada.c             |   31
-rw-r--r--  fs/btrfs/relocation.c        |   40
-rw-r--r--  fs/btrfs/root-tree.c         |    4
-rw-r--r--  fs/btrfs/scrub.c             | 1857
-rw-r--r--  fs/btrfs/send.c              |   12
-rw-r--r--  fs/btrfs/super.c             |   50
-rw-r--r--  fs/btrfs/transaction.c       |  214
-rw-r--r--  fs/btrfs/transaction.h       |    2
-rw-r--r--  fs/btrfs/tree-log.c          |  485
-rw-r--r--  fs/btrfs/volumes.c           |  982
-rw-r--r--  fs/btrfs/volumes.h           |   35
-rw-r--r--  fs/btrfs/xattr.c             |   13
43 files changed, 5600 insertions, 1855 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 		ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 		if (ret < 0)
 			return ret;
+		if (ret == 0)
+			acl = NULL;
 	}
 	ret = 0;
 	break;
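
Note on the fix above: posix_acl_equiv_mode() returns a negative errno on failure, a positive value when the ACL carries information beyond the mode bits, and 0 when the mode bits alone represent it exactly; in the last case the ACL should be dropped rather than stored as a redundant item. A minimal user-space sketch of that contract (acl_equiv_mode() is a hypothetical stand-in for the kernel helper):

	/* hypothetical stand-in: <0 error, 0 mode bits suffice, >0 extra entries */
	static int acl_equiv_mode(int has_named_entries, unsigned int *mode)
	{
		if (!mode)
			return -22;	/* -EINVAL */
		return has_named_entries ? 1 : 0;
	}

	/* the calling pattern the two added lines establish */
	ret = acl_equiv_mode(0, &inode_mode);
	if (ret < 0)
		return ret;
	if (ret == 0)
		acl = NULL;	/* nothing beyond the mode bits: store no ACL */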
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
 			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
 					ref1 = ref2;
 					ref2 = xchg;
 				}
-				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
 					continue;
-				ref1->count += ref2->count;
 			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
 			list_del(&ref2->list);
 			kfree(ref2);
 		}
@@ -890,8 +899,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
-		if (ref->count < 0)
-			WARN_ON(1);
+		WARN_ON(ref->count < 0);
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
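
The added lines in __merge_refs() splice ref2's extent_inode_elem chain onto the tail of ref1's before ref2 is freed, where previously that chain was simply dropped when two preliminary refs merged. The tail-splice pattern in isolation, as a sketch (struct node is a hypothetical singly linked list, not a btrfs type):

	struct node {
		struct node *next;
	};

	/* append src to dst and return the new head; walks dst once to its tail */
	static struct node *splice_tail(struct node *dst, struct node *src)
	{
		struct node *tail = dst;

		while (tail && tail->next)
			tail = tail->next;
		if (tail)
			tail->next = src;
		else
			dst = src;	/* dst was empty, src becomes the whole list */
		return dst;
	}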
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
 
 	unsigned long runtime_flags;
 
+	/* Keep track of who's O_SYNC/fsyncing currently */
+	atomic_t sync_writers;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
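
The new sync_writers counter lets the submission path see whether an O_SYNC write or fsync is currently in flight for this inode, so checksumming can be done inline instead of being deferred to a worker (the users are in file.c and inode.c elsewhere in this series). The counter protocol, as a hedged sketch rather than the exact btrfs code:

	/* entering fsync()/O_SYNC write */
	atomic_inc(&BTRFS_I(inode)->sync_writers);

	/* submission path: someone is waiting, do not defer to async workers */
	if (atomic_read(&BTRFS_I(inode)->sync_writers))
		/* ... checksum/submit inline ... */;

	/* leaving */
	atomic_dec(&BTRFS_I(inode)->sync_writers);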
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
 	unsigned int never_written:1;	/* block was added because it was
 					 * referenced, not because it was
 					 * written */
-	unsigned int mirror_num:2;	/* large enough to hold
+	unsigned int mirror_num;	/* large enough to hold
 					 * BTRFS_SUPER_MIRROR_MAX */
 	struct btrfsic_dev_state *dev_state;
 	u64 dev_bytenr;		/* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
 	*next_blockp = NULL;
 	if (0 == *num_copiesp) {
 		*num_copiesp =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
 		chunk_len = num_bytes;
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->datablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	struct btrfs_device *device;
 
 	length = len;
-	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+	ret = btrfs_map_block(state->root->fs_info, READ,
 			      bytenr, &length, &multi, mirror_num);
 
+	if (ret) {
+		block_ctx_out->start = 0;
+		block_ctx_out->dev_bytenr = 0;
+		block_ctx_out->len = 0;
+		block_ctx_out->dev = NULL;
+		block_ctx_out->datav = NULL;
+		block_ctx_out->pagev = NULL;
+		block_ctx_out->mem_to_free = NULL;
+
+		return ret;
+	}
+
 	device = multi->stripes[0].dev;
 	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
 	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	block_ctx_out->pagev = NULL;
 	block_ctx_out->mem_to_free = NULL;
 
-	if (0 == ret)
-		kfree(multi);
+	kfree(multi);
 	if (NULL == block_ctx_out->dev) {
 		ret = -ENXIO;
 		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, BTRFS_SUPER_INFO_SIZE);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 	struct btrfsic_block_data_ctx block_ctx;
 	int match = 0;
 
-	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	num_copies = btrfs_num_copies(state->root->fs_info,
 				      bytenr, state->metablock_size);
 
 	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 		ret = btrfs_map_bio(root, READ, comp_bio,
 				    mirror_num, 0);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret)
+			bio_endio(comp_bio, ret);
 
 		bio_put(comp_bio);
 
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-	BUG_ON(ret); /* -ENOMEM */
+	if (ret)
+		bio_endio(comp_bio, ret);
 
 	bio_put(comp_bio);
 	return 0;
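
With the BUG_ON() calls gone, a btrfs_map_bio() failure is reported through the bio's completion callback instead of crashing the machine: in this era's block API, bio_endio(bio, error) runs bi_end_io with the error, so the compressed-read end_io path can fail the pages and propagate the error. The resulting pattern, as a sketch (old two-argument bio_endio signature):

	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
	if (ret)
		bio_endio(comp_bio, ret);	/* invokes bi_end_io(comp_bio, ret) */
	bio_put(comp_bio);			/* drop our reference either way */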
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..eea5da7a2b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 struct btrfs_path *path, int level, int slot, 41 struct btrfs_path *path, int level, int slot);
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb); 43 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 44struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
776 775
777static noinline void 776static noinline void
778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 777tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779 struct extent_buffer *eb, 778 struct extent_buffer *eb, int slot, int atomic)
780 struct btrfs_disk_key *disk_key, int slot, int atomic)
781{ 779{
782 int ret; 780 int ret;
783 781
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1361 u64 search_start; 1359 u64 search_start;
1362 int ret; 1360 int ret;
1363 1361
1364 if (trans->transaction != root->fs_info->running_transaction) { 1362 if (trans->transaction != root->fs_info->running_transaction)
1365 printk(KERN_CRIT "trans %llu running %llu\n", 1363 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1366 (unsigned long long)trans->transid, 1364 (unsigned long long)trans->transid,
1367 (unsigned long long) 1365 (unsigned long long)
1368 root->fs_info->running_transaction->transid); 1366 root->fs_info->running_transaction->transid);
1369 WARN_ON(1); 1367
1370 } 1368 if (trans->transid != root->fs_info->generation)
1371 if (trans->transid != root->fs_info->generation) { 1369 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1372 printk(KERN_CRIT "trans %llu running %llu\n",
1373 (unsigned long long)trans->transid, 1370 (unsigned long long)trans->transid,
1374 (unsigned long long)root->fs_info->generation); 1371 (unsigned long long)root->fs_info->generation);
1375 WARN_ON(1);
1376 }
1377 1372
1378 if (!should_cow_block(trans, root, buf)) { 1373 if (!should_cow_block(trans, root, buf)) {
1379 *cow_ret = buf; 1374 *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction)
-		WARN_ON(1);
-	if (trans->transid != root->fs_info->generation)
-		WARN_ON(1);
+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
+	WARN_ON(trans->transid != root->fs_info->generation);
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(right) == 0) {
 		clean_tree_block(trans, root, right);
 		btrfs_tree_unlock(right);
-		del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+		del_ptr(trans, root, path, level + 1, pslot + 1);
 		root_sub_used(root, right->len);
 		btrfs_free_tree_block(trans, root, right, 0, 1);
 		free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		struct btrfs_disk_key right_key;
 		btrfs_node_key(right, &right_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &right_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &right_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 	}
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(mid) == 0) {
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		del_ptr(trans, root, path, level + 1, pslot, 1);
+		del_ptr(trans, root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
 		btrfs_node_key(mid, &mid_key, 0);
-		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+		tree_mod_log_set_node_key(root->fs_info, parent,
 					  pslot, 0);
 		btrfs_set_node_key(parent, &mid_key, pslot);
 		btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			orig_slot += left_nr;
 			btrfs_node_key(mid, &disk_key, 0);
 			tree_mod_log_set_node_key(root->fs_info, parent,
-						  &disk_key, pslot, 0);
+						  pslot, 0);
 			btrfs_set_node_key(parent, &disk_key, pslot);
 			btrfs_mark_buffer_dirty(parent);
 			if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
 		btrfs_node_key(right, &disk_key, 0);
 		tree_mod_log_set_node_key(root->fs_info, parent,
-					  &disk_key, pslot + 1, 0);
+					  pslot + 1, 0);
 		btrfs_set_node_key(parent, &disk_key, pslot + 1);
 		btrfs_mark_buffer_dirty(parent);
 
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 	int no_skips = 0;
 	struct extent_buffer *t;
 
+	if (path->really_keep_locks)
+		return;
+
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
 			break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks)
+	if (path->keep_locks || path->really_keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!cow)
 		write_lock_level = -1;
 
-	if (cow && (p->keep_locks || p->lowest_level))
+	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
 		write_lock_level = BTRFS_MAX_LEVEL;
 
 	min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
 			 * must have write locks on this node and the
 			 * parent
 			 */
-			if (level + 1 > write_lock_level) {
+			if (level > write_lock_level ||
+			    (level + 1 > write_lock_level &&
+			     level + 1 < BTRFS_MAX_LEVEL &&
+			     p->nodes[level + 1])) {
 				write_lock_level = level + 1;
 				btrfs_release_path(p);
 				goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
-		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
  */
 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
+	struct btrfs_item *start_item;
+	struct btrfs_item *end_item;
+	struct btrfs_map_token token;
 	int data_len;
 	int nritems = btrfs_header_nritems(l);
 	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
-	data_len = btrfs_item_end_nr(l, start);
-	data_len = data_len - btrfs_item_offset_nr(l, end);
+	btrfs_init_map_token(&token);
+	start_item = btrfs_item_nr(l, start);
+	end_item = btrfs_item_nr(l, end);
+	data_len = btrfs_token_item_offset(l, start_item, &token) +
+		btrfs_token_item_size(l, start_item, &token);
+	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (push_items == 0)
 		goto out_unlock;
 
-	if (!empty && push_items == left_nritems)
-		WARN_ON(1);
+	WARN_ON(!empty && push_items == left_nritems);
 
 	/* push left to right */
 	right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
 	/* fixup right node */
-	if (push_items > right_nritems) {
-		printk(KERN_CRIT "push items %d nr %u\n", push_items,
-		       right_nritems);
-		WARN_ON(1);
-	}
+	if (push_items > right_nritems)
+		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
+		     right_nritems);
 
 	if (push_items < right_nritems) {
 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,8 +4605,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * empty a node.
  */
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot,
-		    int tree_mod_log)
+		    struct btrfs_path *path, int level, int slot)
 {
 	struct extent_buffer *parent = path->nodes[level];
 	u32 nritems;
@@ -4611,7 +4613,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(parent);
 	if (slot != nritems - 1) {
-		if (tree_mod_log && level)
+		if (level)
 			tree_mod_log_eb_move(root->fs_info, parent, slot,
 					     slot + 1, nritems - slot - 1);
 		memmove_extent_buffer(parent,
@@ -4619,7 +4621,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			      btrfs_node_key_ptr_offset(slot + 1),
 			      sizeof(struct btrfs_key_ptr) *
 			      (nritems - slot - 1));
-	} else if (tree_mod_log && level) {
+	} else if (level) {
 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
 					      MOD_LOG_KEY_REMOVE);
 		BUG_ON(ret < 0);
@@ -4656,7 +4658,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
 			    struct extent_buffer *leaf)
 {
 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-	del_ptr(trans, root, path, 1, path->slots[1], 1);
+	del_ptr(trans, root, path, 1, path->slots[1]);
 
 	/*
 	 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5125,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 	right_path->search_commit_root = 1;
 	right_path->skip_locking = 1;
 
-	spin_lock(&left_root->root_times_lock);
+	spin_lock(&left_root->root_item_lock);
 	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
-	spin_unlock(&left_root->root_times_lock);
+	spin_unlock(&left_root->root_item_lock);
 
-	spin_lock(&right_root->root_times_lock);
+	spin_lock(&right_root->root_item_lock);
 	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
-	spin_unlock(&right_root->root_times_lock);
+	spin_unlock(&right_root->root_item_lock);
 
 	trans = btrfs_join_transaction(left_root);
 	if (IS_ERR(trans)) {
@@ -5224,15 +5226,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 			goto out;
 		}
 
-		spin_lock(&left_root->root_times_lock);
+		spin_lock(&left_root->root_item_lock);
 		ctransid = btrfs_root_ctransid(&left_root->root_item);
-		spin_unlock(&left_root->root_times_lock);
+		spin_unlock(&left_root->root_item_lock);
 		if (ctransid != left_start_ctransid)
 			left_start_ctransid = 0;
 
-		spin_lock(&right_root->root_times_lock);
+		spin_lock(&right_root->root_item_lock);
 		ctransid = btrfs_root_ctransid(&right_root->root_item);
-		spin_unlock(&right_root->root_times_lock);
+		spin_unlock(&right_root->root_item_lock);
 		if (ctransid != right_start_ctransid)
 			right_start_ctransid = 0;
 
@@ -5496,6 +5498,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	return btrfs_next_old_leaf(root, path, 0);
 }
 
+/* Release the path up to but not including the given level */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+	int i;
+
+	for (i = 0; i < level; i++) {
+		path->slots[i] = 0;
+		if (!path->nodes[i])
+			continue;
+		if (path->locks[i]) {
+			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+			path->locks[i] = 0;
+		}
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these are not true do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  int del)
+{
+	struct extent_buffer *b;
+	struct btrfs_key key;
+	u32 nritems;
+	int level = 1;
+	int slot;
+	int ret = 1;
+	int write_lock_level = BTRFS_MAX_LEVEL;
+	int ins_len = del ? -1 : 0;
+
+	WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+	while (path->nodes[level]) {
+		nritems = btrfs_header_nritems(path->nodes[level]);
+		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
+search:
+			btrfs_release_path(path);
+			ret = btrfs_search_slot(trans, root, &key, path,
+						ins_len, 1);
+			if (ret < 0)
+				goto out;
+			level = 1;
+			continue;
+		}
+
+		if (path->slots[level] >= nritems - 1) {
+			level++;
+			continue;
+		}
+
+		btrfs_release_level(path, level);
+		break;
+	}
+
+	if (!path->nodes[level]) {
+		ret = 1;
+		goto out;
+	}
+
+	path->slots[level]++;
+	b = path->nodes[level];
+
+	while (b) {
+		level = btrfs_header_level(b);
+
+		if (!should_cow_block(trans, root, b))
+			goto cow_done;
+
+		btrfs_set_path_blocking(path);
+		ret = btrfs_cow_block(trans, root, b,
+				      path->nodes[level + 1],
+				      path->slots[level + 1], &b);
+		if (ret)
+			goto out;
+cow_done:
+		path->nodes[level] = b;
+		btrfs_clear_path_blocking(path, NULL, 0);
+		if (level != 0) {
+			ret = setup_nodes_for_search(trans, root, path, b,
+						     level, ins_len,
+						     &write_lock_level);
+			if (ret == -EAGAIN)
+				goto search;
+			if (ret)
+				goto out;
+
+			b = path->nodes[level];
+			slot = path->slots[level];
+
+			ret = read_block_for_search(trans, root, path,
+						    &b, level, slot, &key, 0);
+			if (ret == -EAGAIN)
+				goto search;
+			if (ret)
+				goto out;
+			level = btrfs_header_level(b);
+			if (!btrfs_try_tree_write_lock(b)) {
+				btrfs_set_path_blocking(path);
+				btrfs_tree_lock(b);
+				btrfs_clear_path_blocking(path, b,
+							  BTRFS_WRITE_LOCK);
+			}
+			path->locks[level] = BTRFS_WRITE_LOCK;
+			path->nodes[level] = b;
+			path->slots[level] = 0;
+		} else {
+			path->slots[level] = 0;
+			ret = 0;
+			break;
+		}
+	}
+
+out:
+	if (ret)
+		btrfs_release_path(path);
+
+	return ret;
+}
+
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq)
 {
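
btrfs_next_leaf_write() above is a write-locked counterpart of btrfs_next_leaf(): it steps to the next leaf while re-acquiring BTRFS_WRITE_LOCK on the way down (passing ins_len = -1 when del is set so deletions may trigger rebalancing), and it releases the path itself whenever it returns nonzero. A hedged sketch of the caller shape it is designed for, given its stated keep_locks/no-insert assumptions:

	path->really_keep_locks = 1;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	while (ret == 0) {
		/* ... process or delete items in leaf path->nodes[0] ... */
		ret = btrfs_next_leaf_write(trans, root, path, 1 /* del */);
	}
	/* ret > 0: no more leaves; ret < 0: error; path already released */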
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_BHRfS_M"
 
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
+#define BTRFS_DEV_REPLACE_DEVID 0
+
 /*
  * the max metadata block size. This limit is somewhat artificial,
  * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS	(1 << 30)
+
 #define BTRFS_FT_UNKNOWN	0
 #define BTRFS_FT_REG_FILE	1
 #define BTRFS_FT_DIR		2
@@ -413,7 +418,7 @@ struct btrfs_root_backup {
 	__le64 bytes_used;
 	__le64 num_devices;
 	/* future */
-	__le64 unsed_64[4];
+	__le64 unused_64[4];
 
 	u8 tree_root_level;
 	u8 chunk_root_level;
@@ -571,6 +576,7 @@ struct btrfs_path {
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
+	unsigned int really_keep_locks:1;
 };
 
 /*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
 	__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
 } __attribute__ ((__packed__));
 
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED	0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED		1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED		2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED		3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED		4
+
+struct btrfs_dev_replace {
+	u64 replace_state;	/* see #define above */
+	u64 time_started;	/* seconds since 1-Jan-1970 */
+	u64 time_stopped;	/* seconds since 1-Jan-1970 */
+	atomic64_t num_write_errors;
+	atomic64_t num_uncorrectable_read_errors;
+
+	u64 cursor_left;
+	u64 committed_cursor_left;
+	u64 cursor_left_last_write_of_item;
+	u64 cursor_right;
+
+	u64 cont_reading_from_srcdev_mode;	/* see #define above */
+
+	int is_valid;
+	int item_needs_writeback;
+	struct btrfs_device *srcdev;
+	struct btrfs_device *tgtdev;
+
+	pid_t lock_owner;
+	atomic_t nesting_level;
+	struct mutex lock_finishing_cancel_unmount;
+	struct mutex lock_management_lock;
+	struct mutex lock;
+
+	struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+	/*
+	 * grow this item struct at the end for future enhancements and keep
+	 * the existing values unchanged
+	 */
+	__le64 src_devid;
+	__le64 cursor_left;
+	__le64 cursor_right;
+	__le64 cont_reading_from_srcdev_mode;
+
+	__le64 replace_state;
+	__le64 time_started;
+	__le64 time_stopped;
+	__le64 num_write_errors;
+	__le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers generic_worker;
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
+	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
 	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
+	struct btrfs_workers scrub_wr_completion_workers;
+	struct btrfs_workers scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
 	int backup_root_index;
 
 	int num_tolerated_disk_barrier_failures;
+
+	/* device replace state */
+	struct btrfs_dev_replace dev_replace;
+
+	atomic_t mutually_exclusive_operation_running;
 };
 
 /*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
 
 	int force_cow;
 
-	spinlock_t root_times_lock;
+	spinlock_t root_item_lock;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_STATS_KEY	249
 
 /*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY	250
+
+/*
  * string items are for debugging. They just store a short string of
  * data in the FS
  */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
 
 static inline void btrfs_init_map_token(struct btrfs_map_token *token)
 {
-	memset(token, 0, sizeof(*token));
+	token->kaddr = NULL;
 }
 
 /* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
 BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
 		   rsv_excl, 64);
 
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+		   struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+		   replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+		   time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+		   time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+		   num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+		   cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+		   cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+			 struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+			 struct btrfs_dev_replace_item,
+			 cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+			 struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+			 struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+			 struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+			 struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+			 struct btrfs_dev_replace_item,
+			 num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+			 struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+			 struct btrfs_dev_replace_item, cursor_right, 64);
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+	/* If we are in the transaction, we can't flush anything. */
+	BTRFS_RESERVE_NO_FLUSH,
+	/*
+	 * Flushing delalloc may cause deadlock somewhere, in this
+	 * case, use FLUSH LIMIT
+	 */
+	BTRFS_RESERVE_FLUSH_LIMIT,
+	BTRFS_RESERVE_FLUSH_ALL,
+};
+
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes);
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor);
 int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv,
-			   u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-				   struct btrfs_block_rsv *block_rsv,
-				   u64 min_reserved);
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  int del);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq);
 static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 
 /* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+				   const char *name, int name_len);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
 			  int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
 /* inode.c */
+struct btrfs_delalloc_work {
+	struct inode *inode;
+	int wait;
+	int delay_iput;
+	struct completion completion;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						      int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
 
 /* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			     int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
 
 /* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-		    struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
 void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
 void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+			   struct btrfs_device *dev);
 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
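
The former *_noflush variants are folded into btrfs_block_rsv_add()/_refill() via the new btrfs_reserve_flush_enum argument, so each caller now states explicitly how aggressively a reservation may flush. The migration for existing callers (the NO_FLUSH mapping is confirmed by the delayed-inode.c hunks below; treating BTRFS_RESERVE_FLUSH_ALL as the old flushing default is an assumption from the enum naming):

	/* before */
	ret = btrfs_block_rsv_add_noflush(root, rsv, num_bytes);

	/* after */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH);

	/* callers of the old flushing variant presumably pass: */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL);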
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
 		 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 		 * reserve something strictly for us. If not be a pain and try
 		 * to steal from the delalloc block rsv.
 		 */
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		if (!ret)
 			goto out;
 
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_delayed_node *delayed_node = NULL;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
-	unsigned long nr = 0;
 	int need_requeue = 0;
 	int ret;
 
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 					   delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
-	nr = trans->blocks_used;
-
 	trans->block_rsv = block_rsv;
 	btrfs_end_transaction_dmeta(trans, root);
-	__btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty_nodelay(root);
 free_path:
 	btrfs_free_path(path);
 out:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 107		pr_warn("btrfs: dev_replace entry found has unexpected size, ignoring entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
 156			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
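	/*
	 * ret == 0 means an item with this key exists; ret == 1 means
	 * nothing was found and a fresh item is inserted further below
	 */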
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
223 * need to delete old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
 235			pr_warn("btrfs: deleting too-small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
305
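/*
 * entry point for the replace-start ioctl: validate the arguments, open
 * the target device, flip the state to STARTED (from then on writes are
 * mirrored to the target, see btrfs_map_block()), commit that state to
 * disk and let the scrub code copy over the existing data
 */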
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
 373		      "btrfs: dev_replace from %s (devid %llu) to %s started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
430
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466 * copy operation is declared as being finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
 507		      "btrfs: dev_replace from %s (devid %llu) to %s finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is not part of the filesystem anymore and its 1st
537 * superblock is scratched out so that it is no longer marked to
538 * belong to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
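	/*
	 * walk all chunk mappings: lookup_extent_mapping() returns the
	 * next mapping that overlaps the queried range, so advancing
	 * 'start' past each hit visits every chunk exactly once
	 */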
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
 604	/* even if !dev_replace->is_valid, the values are good enough
 605	 * for the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
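		/* progress in tenths of a percent: bytes copied so far
		 * divided by one thousandth of the source device size */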
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799 /*
 800		 * return true even if tgtdev is missing (this is
 801		 * something that can happen if the dev_replace
 802		 * procedure is suspended by an umount and then
 803		 * the tgtdev is missing, or "btrfs dev scan" was
 804		 * not called and the filesystem is remounted
 805		 * in degraded state). This does not stop the
 806		 * dev_replace procedure. It needs to be canceled
 807		 * manually if the cancellation is wanted.
808 */
809 break;
810 }
811 return 1;
812}
813
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
 820		 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
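	/* slow path: the lock is contended or already held; if this
	 * thread is the owner, just bump the nesting level, otherwise
	 * block on the mutex via the acquire_lock path above */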
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
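
The two functions above form a PID-based recursive mutex. A minimal sketch
of the nesting contract they guarantee (illustrative only, not part of the
patch; the function name is hypothetical):

	static void example_nested_locking(struct btrfs_dev_replace *dev_replace)
	{
		btrfs_dev_replace_lock(dev_replace);	/* level 0 -> 1, takes the mutex */
		btrfs_dev_replace_lock(dev_replace);	/* same pid: 1 -> 2, no blocking */
		btrfs_dev_replace_unlock(dev_replace);	/* 2 -> 1, mutex still held */
		btrfs_dev_replace_unlock(dev_replace);	/* 1 -> 0, mutex released */
	}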
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
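
For orientation, a sketch of how an ioctl handler could dispatch to the
entry points declared above; the dispatch function is hypothetical, and
the cmd field plus the BTRFS_IOCTL_DEV_REPLACE_CMD_* values are assumed
from the companion ioctl.h changes in this series:

	static long example_dev_replace_ioctl(struct btrfs_root *root,
					      struct btrfs_ioctl_dev_replace_args *args)
	{
		struct btrfs_fs_info *fs_info = root->fs_info;

		switch (args->cmd) {
		case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
			return btrfs_dev_replace_start(root, args);
		case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
			btrfs_dev_replace_status(fs_info, args);
			return 0;
		case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
			return btrfs_dev_replace_cancel(fs_info, args);
		default:
			return -EINVAL;
		}
	}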
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238 /* return back any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
 257	 * see if there is room in the leaf to extend the
 258	 * existing item with this name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
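
A short sketch of how a caller might use the new collision check before
creating a directory entry (the wrapper below is illustrative, not part
of this patch):

	static int example_may_create_name(struct btrfs_root *root, u64 dir,
					   const char *name, int name_len)
	{
		int ret;

		ret = btrfs_check_dir_item_collision(root, dir, name, name_len);
		if (ret == -EEXIST)
			return ret;	/* exact name already exists in the dir */
		if (ret == -EOVERFLOW)
			return ret;	/* matching hash item has no room left */
		return ret;		/* 0 on success, or a search error */
	}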
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
 2495 printk(KERN_ERR "Failed to initialize space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3324
3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3274 3326
3275 btrfs_scrub_cancel(root); 3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
3418 if (num_dirty > thresh) { 3471 if (num_dirty > thresh) {
3419 balance_dirty_pages_ratelimited_nr( 3472 balance_dirty_pages_ratelimited(
3420 root->fs_info->btree_inode->i_mapping, 1); 3473 root->fs_info->btree_inode->i_mapping);
3421 } 3474 }
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited_nr( 3484{
3441 root->fs_info->btree_inode->i_mapping, 1); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..5a3327b8f90d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
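			/* on error, drop the ref from the cluster and unlock
			 * it so that it is not left locked forever */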
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
 3689 * 1/8th of the space. If we can flush, let it overcommit up to 3679 * 1/2 of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
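/*
 * start writeback only when it is safe: skip if writeback is already in
 * progress, and take s_umount with a trylock so that a concurrent umount
 * or remount cannot deadlock against this flush
 */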
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
3888 * @root - the root we're allocating for 3898 * @root - the root we're allocating for
3889 * @block_rsv - the block_rsv we're allocating for 3899 * @block_rsv - the block_rsv we're allocating for
3890 * @orig_bytes - the number of bytes we want 3900 * @orig_bytes - the number of bytes we want
3891 * @flush - wether or not we can flush to make our reservation 3901 * @flush - whether or not we can flush to make our reservation
3892 * 3902 *
 3893 * This will reserve orig_bytes number of bytes from the space info associated 3903 * with the block_rsv. If there is not enough space it will make an attempt to
3894 * with the block_rsv. If there is not enough space it will make an attempt to 3904 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
4016 * could occur. So skip the delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
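The hunks above replace the old boolean flush argument with a three-level enum, and the retry ladder now depends on it: BTRFS_RESERVE_NO_FLUSH bails out immediately, BTRFS_RESERVE_FLUSH_LIMIT skips the delalloc states (the deadlock risk noted in the comment) and stops short of COMMIT_TRANS, and BTRFS_RESERVE_FLUSH_ALL walks every state. The following is an illustrative userspace toy, not kernel code; the flush-state names and their ordering are assumed from the surrounding diff, and a simple attempt counter stands in for the reservation actually succeeding.

#include <stdio.h>

enum btrfs_reserve_flush_enum {
	BTRFS_RESERVE_NO_FLUSH,
	BTRFS_RESERVE_FLUSH_LIMIT,
	BTRFS_RESERVE_FLUSH_ALL,
};

/* assumed ordering of the flush-state ladder */
enum {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/* the fake reservation "succeeds" on the succeed_at-th attempt */
static int reserve(enum btrfs_reserve_flush_enum flush, int succeed_at)
{
	int flush_state = FLUSH_DELAYED_ITEMS_NR;
	int attempt = 0;

	for (;;) {
		int ret = (++attempt >= succeed_at) ? 0 : -1; /* -ENOSPC */

		if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
			return ret;

		printf("  flush state %d\n", flush_state);
		flush_state++;

		/* FLUSH_LIMIT may not touch delalloc: deadlock risk */
		if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
		    (flush_state == FLUSH_DELALLOC ||
		     flush_state == FLUSH_DELALLOC_WAIT))
			flush_state = ALLOC_CHUNK;

		if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
		    flush_state < COMMIT_TRANS)
			continue;	/* retry, but never commit */
		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
		    flush_state <= COMMIT_TRANS)
			continue;	/* retry through every state */
		return ret;		/* ladder exhausted */
	}
}

int main(void)
{
	printf("FLUSH_ALL -> %d\n", reserve(BTRFS_RESERVE_FLUSH_ALL, 4));
	printf("FLUSH_LIMIT -> %d\n", reserve(BTRFS_RESERVE_FLUSH_LIMIT, 4));
	printf("NO_FLUSH -> %d\n", reserve(BTRFS_RESERVE_NO_FLUSH, 4));
	return 0;
}

Against the same failing reservation, FLUSH_ALL eventually succeeds, FLUSH_LIMIT gives up once it has burned through the non-delalloc states, and NO_FLUSH returns the error on the spot.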
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
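With the enum in place, the _noflush wrapper pairs deleted above become redundant: the single entry points now take the flush policy as an argument. A hypothetical caller migration, compilable as a standalone stub; the kernel structures are opaque dummies here and only the signature change is taken from the diff.

#include <stdio.h>

typedef unsigned long long u64;

enum btrfs_reserve_flush_enum {
	BTRFS_RESERVE_NO_FLUSH,
	BTRFS_RESERVE_FLUSH_LIMIT,
	BTRFS_RESERVE_FLUSH_ALL,
};

struct btrfs_root { int dummy; };
struct btrfs_block_rsv { u64 reserved; };

/* stub with the post-patch signature: flush policy is an argument */
static int btrfs_block_rsv_add(struct btrfs_root *root,
			       struct btrfs_block_rsv *rsv, u64 num_bytes,
			       enum btrfs_reserve_flush_enum flush)
{
	(void)root;
	rsv->reserved += num_bytes;
	printf("reserved %llu bytes, flush policy %d\n", num_bytes, flush);
	return 0;
}

int main(void)
{
	struct btrfs_root root = { 0 };
	struct btrfs_block_rsv rsv = { 0 };

	/* was: btrfs_block_rsv_add_noflush(&root, &rsv, 4096); */
	return btrfs_block_rsv_add(&root, &rsv, 4096,
				   BTRFS_RESERVE_NO_FLUSH);
}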
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret = 0;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we must not flush, since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4568,16 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4568 csum_bytes = BTRFS_I(inode)->csum_bytes; 4579 csum_bytes = BTRFS_I(inode)->csum_bytes;
4569 spin_unlock(&BTRFS_I(inode)->lock); 4580 spin_unlock(&BTRFS_I(inode)->lock);
4570 4581
4571 if (root->fs_info->quota_enabled) { 4582 if (root->fs_info->quota_enabled)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret;
4577 }
4578 }
4579 4585
4580 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4586 /*
4587 * A non-zero ret here means the qgroup reservation failed; go straight
4588 * to the shared error handling in that case.
4589 */
4590 if (ret == 0)
4591 ret = reserve_metadata_bytes(root, block_rsv,
4592 to_reserve, flush);
4593
4581 if (ret) { 4594 if (ret) {
4582 u64 to_free = 0; 4595 u64 to_free = 0;
4583 unsigned dropped; 4596 unsigned dropped;
@@ -4607,7 +4620,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4620 btrfs_ino(inode),
4608 to_free, 0); 4621 to_free, 0);
4609 } 4622 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4623 if (root->fs_info->quota_enabled) {
4624 btrfs_qgroup_free(root, num_bytes +
4625 nr_extents * root->leafsize);
4626 }
4627 if (delalloc_lock)
4628 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4629 return ret;
4612 } 4630 }
4613 4631
@@ -4619,7 +4637,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4637 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4638 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4639 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4640
4641 if (delalloc_lock)
4642 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4643
4624 if (to_reserve) 4644 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4645 trace_btrfs_space_reservation(root->fs_info,"delalloc",
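The reworked error path above pairs the two reservations: the qgroup bytes are taken first, and if reserve_metadata_bytes() then fails, they are handed back before returning. A toy model with plain counters, not the kernel accounting; the qgroup step is assumed to succeed here so the rollback is what gets exercised.

#include <stdio.h>

static long long qgroup_reserved;

static int qgroup_reserve(long long bytes)
{
	qgroup_reserved += bytes;
	return 0;
}

static void qgroup_free(long long bytes)
{
	qgroup_reserved -= bytes;
}

static int reserve_metadata_bytes(long long bytes)
{
	(void)bytes;
	return -1;	/* simulate ENOSPC */
}

static int delalloc_reserve(long long bytes, int quota_enabled)
{
	int ret = 0;

	if (quota_enabled)
		ret = qgroup_reserve(bytes);
	/* a qgroup failure falls through to the shared error handling */
	if (ret == 0)
		ret = reserve_metadata_bytes(bytes);
	if (ret) {
		if (quota_enabled)
			qgroup_free(bytes);	/* undo the first step */
		return ret;
	}
	return 0;
}

int main(void)
{
	int ret = delalloc_reserve(16384, 1);

	/* the failed attempt must leave no qgroup bytes behind */
	printf("ret=%d qgroup_reserved=%lld\n", ret, qgroup_reserved);
	return 0;
}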
@@ -4969,9 +4989,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{ 4989{
4970 struct btrfs_fs_info *fs_info = root->fs_info; 4990 struct btrfs_fs_info *fs_info = root->fs_info;
4971 struct btrfs_block_group_cache *cache = NULL; 4991 struct btrfs_block_group_cache *cache = NULL;
4992 struct btrfs_space_info *space_info;
4993 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4972 u64 len; 4994 u64 len;
4995 bool readonly;
4973 4996
4974 while (start <= end) { 4997 while (start <= end) {
4998 readonly = false;
4975 if (!cache || 4999 if (!cache ||
4976 start >= cache->key.objectid + cache->key.offset) { 5000 start >= cache->key.objectid + cache->key.offset) {
4977 if (cache) 5001 if (cache)
@@ -4989,15 +5013,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4989 } 5013 }
4990 5014
4991 start += len; 5015 start += len;
5016 space_info = cache->space_info;
4992 5017
4993 spin_lock(&cache->space_info->lock); 5018 spin_lock(&space_info->lock);
4994 spin_lock(&cache->lock); 5019 spin_lock(&cache->lock);
4995 cache->pinned -= len; 5020 cache->pinned -= len;
4996 cache->space_info->bytes_pinned -= len; 5021 space_info->bytes_pinned -= len;
4997 if (cache->ro) 5022 if (cache->ro) {
4998 cache->space_info->bytes_readonly += len; 5023 space_info->bytes_readonly += len;
5024 readonly = true;
5025 }
4999 spin_unlock(&cache->lock); 5026 spin_unlock(&cache->lock);
5000 spin_unlock(&cache->space_info->lock); 5027 if (!readonly && global_rsv->space_info == space_info) {
5028 spin_lock(&global_rsv->lock);
5029 if (!global_rsv->full) {
5030 len = min(len, global_rsv->size -
5031 global_rsv->reserved);
5032 global_rsv->reserved += len;
5033 space_info->bytes_may_use += len;
5034 if (global_rsv->reserved >= global_rsv->size)
5035 global_rsv->full = 1;
5036 }
5037 spin_unlock(&global_rsv->lock);
5038 }
5039 spin_unlock(&space_info->lock);
5001 } 5040 }
5002 5041
5003 if (cache) 5042 if (cache)
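unpin_extent_range() above now lets freshly unpinned bytes top up the global block reserve when it is not full, clamping to the space the reserve still wants; only the clamped amount moves to bytes_may_use. The arithmetic in isolation, with plain integers standing in for the locked kernel structures:

#include <stdio.h>

struct block_rsv { unsigned long long size, reserved; int full; };

static unsigned long long refill(struct block_rsv *rsv, unsigned long long len)
{
	if (rsv->full)
		return 0;
	if (len > rsv->size - rsv->reserved)
		len = rsv->size - rsv->reserved;	/* clamp like min() */
	rsv->reserved += len;
	if (rsv->reserved >= rsv->size)
		rsv->full = 1;
	return len;		/* bytes newly accounted as may_use */
}

int main(void)
{
	struct block_rsv global = { .size = 1024, .reserved = 1000 };

	/* unpin 64 bytes: only 24 fit before the reserve is full */
	printf("took %llu, full=%d\n", refill(&global, 64), global.full);
	return 0;
}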
@@ -5466,7 +5505,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5466 return 0; 5505 return 0;
5467} 5506}
5468 5507
5469static int __get_block_group_index(u64 flags) 5508int __get_raid_index(u64 flags)
5470{ 5509{
5471 int index; 5510 int index;
5472 5511
@@ -5486,7 +5525,7 @@ static int __get_block_group_index(u64 flags)
5486 5525
5487static int get_block_group_index(struct btrfs_block_group_cache *cache) 5526static int get_block_group_index(struct btrfs_block_group_cache *cache)
5488{ 5527{
5489 return __get_block_group_index(cache->flags); 5528 return __get_raid_index(cache->flags);
5490} 5529}
5491 5530
5492enum btrfs_loop_type { 5531enum btrfs_loop_type {
@@ -5519,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5519 int empty_cluster = 2 * 1024 * 1024; 5558 int empty_cluster = 2 * 1024 * 1024;
5520 struct btrfs_space_info *space_info; 5559 struct btrfs_space_info *space_info;
5521 int loop = 0; 5560 int loop = 0;
5522 int index = 0; 5561 int index = __get_raid_index(data);
5523 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 5562 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5524 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 5563 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5525 bool found_uncached_bg = false; 5564 bool found_uncached_bg = false;
@@ -6269,7 +6308,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6269 block_rsv = get_block_rsv(trans, root); 6308 block_rsv = get_block_rsv(trans, root);
6270 6309
6271 if (block_rsv->size == 0) { 6310 if (block_rsv->size == 0) {
6272 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6311 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6312 BTRFS_RESERVE_NO_FLUSH);
6273 /* 6313 /*
6274 * If we couldn't reserve metadata bytes try and use some from 6314 * If we couldn't reserve metadata bytes try and use some from
6275 * the global reserve. 6315 * the global reserve.
@@ -6292,11 +6332,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6292 static DEFINE_RATELIMIT_STATE(_rs, 6332 static DEFINE_RATELIMIT_STATE(_rs,
6293 DEFAULT_RATELIMIT_INTERVAL, 6333 DEFAULT_RATELIMIT_INTERVAL,
6294 /*DEFAULT_RATELIMIT_BURST*/ 2); 6334 /*DEFAULT_RATELIMIT_BURST*/ 2);
6295 if (__ratelimit(&_rs)) { 6335 if (__ratelimit(&_rs))
6296 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); 6336 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6297 WARN_ON(1); 6337 ret);
6298 } 6338 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6299 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); 6339 BTRFS_RESERVE_NO_FLUSH);
6300 if (!ret) { 6340 if (!ret) {
6301 return block_rsv; 6341 return block_rsv;
6302 } else if (ret && block_rsv != global_rsv) { 6342 } else if (ret && block_rsv != global_rsv) {
@@ -6746,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6746 &wc->flags[level]); 6786 &wc->flags[level]);
6747 if (ret < 0) { 6787 if (ret < 0) {
6748 btrfs_tree_unlock_rw(eb, path->locks[level]); 6788 btrfs_tree_unlock_rw(eb, path->locks[level]);
6789 path->locks[level] = 0;
6749 return ret; 6790 return ret;
6750 } 6791 }
6751 BUG_ON(wc->refs[level] == 0); 6792 BUG_ON(wc->refs[level] == 0);
6752 if (wc->refs[level] == 1) { 6793 if (wc->refs[level] == 1) {
6753 btrfs_tree_unlock_rw(eb, path->locks[level]); 6794 btrfs_tree_unlock_rw(eb, path->locks[level]);
6795 path->locks[level] = 0;
6754 return 1; 6796 return 1;
6755 } 6797 }
6756 } 6798 }
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7427 */ 7469 */
7428 target = get_restripe_target(root->fs_info, block_group->flags); 7470 target = get_restripe_target(root->fs_info, block_group->flags);
7429 if (target) { 7471 if (target) {
7430 index = __get_block_group_index(extended_to_chunk(target)); 7472 index = __get_raid_index(extended_to_chunk(target));
7431 } else { 7473 } else {
7432 /* 7474 /*
7433 * this is just a balance, so if we were marked as full 7475 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7461 * check to make sure we can actually find a chunk with enough 7503 * check to make sure we can actually find a chunk with enough
7462 * space to fit our block group in. 7504 * space to fit our block group in.
7463 */ 7505 */
7464 if (device->total_bytes > device->bytes_used + min_free) { 7506 if (device->total_bytes > device->bytes_used + min_free &&
7507 !device->is_tgtdev_for_dev_replace) {
7465 ret = find_free_dev_extent(device, min_free, 7508 ret = find_free_dev_extent(device, min_free,
7466 &dev_offset, NULL); 7509 &dev_offset, NULL);
7467 if (!ret) 7510 if (!ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
341{ 341{
342 struct rb_node *node; 342 struct rb_node *node;
343 343
344 if (end < start) { 344 if (end < start)
345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 (unsigned long long)end, 346 (unsigned long long)end,
347 (unsigned long long)start); 347 (unsigned long long)start);
348 WARN_ON(1);
349 }
350 state->start = start; 348 state->start = start;
351 state->end = end; 349 state->end = end;
352 350
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
1919 * the standard behavior is to write all copies in a raid setup. here we only 1917 * the standard behavior is to write all copies in a raid setup. here we only
1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue
1921 * submit_bio directly. 1919 * submit_bio directly.
1922 * to avoid any synchonization issues, wait for the data after writing, which 1920 * to avoid any synchronization issues, wait for the data after writing, which
1923 * actually prevents the read that triggered the error from finishing. 1921 * actually prevents the read that triggered the error from finishing.
1924 * currently, there can be no more than two copies of every data bit. thus, 1922 * currently, there can be no more than two copies of every data bit. thus,
1925 * exactly one rewrite is required. 1923 * exactly one rewrite is required.
1926 */ 1924 */
1927int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1925int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1928 u64 length, u64 logical, struct page *page, 1926 u64 length, u64 logical, struct page *page,
1929 int mirror_num) 1927 int mirror_num)
1930{ 1928{
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1946 bio->bi_size = 0; 1944 bio->bi_size = 0;
1947 map_length = length; 1945 map_length = length;
1948 1946
1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1947 ret = btrfs_map_block(fs_info, WRITE, logical,
1950 &map_length, &bbio, mirror_num); 1948 &map_length, &bbio, mirror_num);
1951 if (ret) { 1949 if (ret) {
1952 bio_put(bio); 1950 bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1984int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1982int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1985 int mirror_num) 1983 int mirror_num)
1986{ 1984{
1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1988 u64 start = eb->start; 1985 u64 start = eb->start;
1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1990 int ret = 0; 1987 int ret = 0;
1991 1988
1992 for (i = 0; i < num_pages; i++) { 1989 for (i = 0; i < num_pages; i++) {
1993 struct page *p = extent_buffer_page(eb, i); 1990 struct page *p = extent_buffer_page(eb, i);
1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1995 start, p, mirror_num); 1992 start, p, mirror_num);
1996 if (ret) 1993 if (ret)
1997 break; 1994 break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
2010 u64 private; 2007 u64 private;
2011 u64 private_failure; 2008 u64 private_failure;
2012 struct io_failure_record *failrec; 2009 struct io_failure_record *failrec;
2013 struct btrfs_mapping_tree *map_tree; 2010 struct btrfs_fs_info *fs_info;
2014 struct extent_state *state; 2011 struct extent_state *state;
2015 int num_copies; 2012 int num_copies;
2016 int did_repair = 0; 2013 int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2047 2044
2048 if (state && state->start == failrec->start) { 2045 if (state && state->start == failrec->start) {
2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2046 fs_info = BTRFS_I(inode)->root->fs_info;
2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2051 failrec->len); 2048 failrec->len);
2052 if (num_copies > 1) { 2049 if (num_copies > 1) {
2053 ret = repair_io_failure(map_tree, start, failrec->len, 2050 ret = repair_io_failure(fs_info, start, failrec->len,
2054 failrec->logical, page, 2051 failrec->logical, page,
2055 failrec->failed_mirror); 2052 failrec->failed_mirror);
2056 did_repair = !ret; 2053 did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2159 * clean_io_failure() clean all those errors at once. 2156 * clean_io_failure() clean all those errors at once.
2160 */ 2157 */
2161 } 2158 }
2162 num_copies = btrfs_num_copies( 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2160 failrec->logical, failrec->len);
2164 failrec->logical, failrec->len);
2165 if (num_copies == 1) { 2161 if (num_copies == 1) {
2166 /* 2162 /*
2167 * we only have a single copy of the data, so don't bother with 2163 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2466 return bio; 2462 return bio;
2467} 2463}
2468 2464
2469/*
2470 * Since writes are async, they will only return -ENOMEM.
2471 * Reads can return the full range of I/O error conditions.
2472 */
2473static int __must_check submit_one_bio(int rw, struct bio *bio, 2465static int __must_check submit_one_bio(int rw, struct bio *bio,
2474 int mirror_num, unsigned long bio_flags) 2466 int mirror_num, unsigned long bio_flags)
2475{ 2467{
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4721 } 4713 }
4722 4714
4723 if (start + min_len > eb->len) { 4715 if (start + min_len > eb->len) {
4724 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4725 "wanted %lu %lu\n", (unsigned long long)eb->start, 4717 "wanted %lu %lu\n", (unsigned long long)eb->start,
4726 eb->len, start, min_len); 4718 eb->len, start, min_len);
4727 WARN_ON(1);
4728 return -EINVAL; 4719 return -EINVAL;
4729 } 4720 }
4730 4721
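Several hunks in this file collapse a printk() followed by WARN_ON(1) into a single WARN() call, so the message and the backtrace condition travel together. A userspace analogue with a local stand-in macro; it uses a GNU C statement expression, as the kernel's own WARN does, but this is not the kernel definition.

#include <stdio.h>

#define WARN(cond, ...)						\
	({							\
		int __w = !!(cond);				\
		if (__w)					\
			fprintf(stderr, __VA_ARGS__);		\
		__w;						\
	})

int main(void)
{
	unsigned long long start = 100, end = 50;

	/* before: a printk of the message, then WARN_ON(1) as a
	 * separate step; after: condition and message in one call */
	WARN(end < start, "btrfs end < start %llu %llu\n", end, start);
	return 0;
}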
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 337btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
338 gfp_t gfp_flags); 338 gfp_t gfp_flags);
339 339
340struct btrfs_mapping_tree; 340struct btrfs_fs_info;
341 341
342int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 342int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
343 u64 length, u64 logical, struct page *page, 343 u64 length, u64 logical, struct page *page,
344 int mirror_num); 344 int mirror_num);
345int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 345int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..fdb7a8db3b57 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
49struct extent_map *alloc_extent_map(void) 49struct extent_map *alloc_extent_map(void)
50{ 50{
51 struct extent_map *em; 51 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); 52 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
53 if (!em) 53 if (!em)
54 return NULL; 54 return NULL;
55 em->in_tree = 0; 55 em->in_tree = 0;
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
171 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) 171 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
172 return 0; 172 return 0;
173 173
174 if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
175 test_bit(EXTENT_FLAG_LOGGING, &next->flags))
176 return 0;
177
174 if (extent_map_end(prev) == next->start && 178 if (extent_map_end(prev) == next->start &&
175 prev->flags == next->flags && 179 prev->flags == next->flags &&
176 prev->bdev == next->bdev && 180 prev->bdev == next->bdev &&
@@ -198,16 +202,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
198 merge = rb_entry(rb, struct extent_map, rb_node); 202 merge = rb_entry(rb, struct extent_map, rb_node);
199 if (rb && mergable_maps(merge, em)) { 203 if (rb && mergable_maps(merge, em)) {
200 em->start = merge->start; 204 em->start = merge->start;
205 em->orig_start = merge->orig_start;
201 em->len += merge->len; 206 em->len += merge->len;
202 em->block_len += merge->block_len; 207 em->block_len += merge->block_len;
203 em->block_start = merge->block_start; 208 em->block_start = merge->block_start;
204 merge->in_tree = 0; 209 merge->in_tree = 0;
205 if (merge->generation > em->generation) { 210 em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
206 em->mod_start = em->start; 211 em->mod_start = merge->mod_start;
207 em->mod_len = em->len; 212 em->generation = max(em->generation, merge->generation);
208 em->generation = merge->generation; 213 list_move(&em->list, &tree->modified_extents);
209 list_move(&em->list, &tree->modified_extents);
210 }
211 214
212 list_del_init(&merge->list); 215 list_del_init(&merge->list);
213 rb_erase(&merge->rb_node, &tree->map); 216 rb_erase(&merge->rb_node, &tree->map);
@@ -223,23 +226,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
223 em->block_len += merge->len; 226 em->block_len += merge->len;
224 rb_erase(&merge->rb_node, &tree->map); 227 rb_erase(&merge->rb_node, &tree->map);
225 merge->in_tree = 0; 228 merge->in_tree = 0;
226 if (merge->generation > em->generation) { 229 em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
227 em->mod_len = em->len; 230 em->generation = max(em->generation, merge->generation);
228 em->generation = merge->generation;
229 list_move(&em->list, &tree->modified_extents);
230 }
231 list_del_init(&merge->list); 231 list_del_init(&merge->list);
232 free_extent_map(merge); 232 free_extent_map(merge);
233 } 233 }
234} 234}
235 235
236/** 236/**
237 * unpint_extent_cache - unpin an extent from the cache 237 * unpin_extent_cache - unpin an extent from the cache
238 * @tree: tree to unpin the extent in 238 * @tree: tree to unpin the extent in
239 * @start: logical offset in the file 239 * @start: logical offset in the file
240 * @len: length of the extent 240 * @len: length of the extent
241 * @gen: generation that this extent has been modified in 241 * @gen: generation that this extent has been modified in
242 * @prealloc: if this is set we need to clear the prealloc flag
243 * 242 *
244 * Called after an extent has been written to disk properly. Set the generation 243 * Called after an extent has been written to disk properly. Set the generation
245 * to the generation that actually added the file item to the inode so we know 244 * to the generation that actually added the file item to the inode so we know
@@ -260,15 +259,16 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
260 if (!em) 259 if (!em)
261 goto out; 260 goto out;
262 261
263 list_move(&em->list, &tree->modified_extents); 262 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
263 list_move(&em->list, &tree->modified_extents);
264 em->generation = gen; 264 em->generation = gen;
265 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 265 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
266 em->mod_start = em->start; 266 em->mod_start = em->start;
267 em->mod_len = em->len; 267 em->mod_len = em->len;
268 268
269 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 269 if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
270 prealloc = true; 270 prealloc = true;
271 clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); 271 clear_bit(EXTENT_FLAG_FILLING, &em->flags);
272 } 272 }
273 273
274 try_merge_map(tree, em); 274 try_merge_map(tree, em);
@@ -285,6 +285,13 @@ out:
285 285
286} 286}
287 287
288void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
289{
290 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
291 if (em->in_tree)
292 try_merge_map(tree, em);
293}
294
288/** 295/**
289 * add_extent_mapping - add new extent map to the extent tree 296 * add_extent_mapping - add new extent map to the extent tree
290 * @tree: tree to insert new map in 297 * @tree: tree to insert new map in
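The try_merge_map() changes above stop gating the modified-range bookkeeping on generation and instead widen the [mod_start, mod_len) span to cover both mappings, taking the larger generation. A worked example of the backward-merge case, with the structures trimmed to the fields the math touches; illustration only.

#include <stdio.h>

struct extent_map { unsigned long long mod_start, mod_len, generation; };

/* em absorbs the mapping immediately before it, as in the diff */
static void merge_prev(struct extent_map *em, const struct extent_map *merge)
{
	em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
	em->mod_start = merge->mod_start;
	em->generation = em->generation > merge->generation ?
			 em->generation : merge->generation;
}

int main(void)
{
	struct extent_map prev = { .mod_start = 0, .mod_len = 4096,
				   .generation = 7 };
	struct extent_map em = { .mod_start = 4096, .mod_len = 8192,
				 .generation = 5 };

	merge_prev(&em, &prev);
	/* expect mod_start=0 mod_len=12288 gen=7 */
	printf("mod_start=%llu mod_len=%llu gen=%llu\n",
	       em.mod_start, em.mod_len, em.generation);
	return 0;
}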
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ 14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ 15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ 16#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
17#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
17 18
18struct extent_map { 19struct extent_map {
19 struct rb_node rb_node; 20 struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
24 u64 mod_start; 25 u64 mod_start;
25 u64 mod_len; 26 u64 mod_len;
26 u64 orig_start; 27 u64 orig_start;
28 u64 orig_block_len;
27 u64 block_start; 29 u64 block_start;
28 u64 block_len; 30 u64 block_len;
29 u64 generation; 31 u64 generation;
@@ -67,6 +69,7 @@ void free_extent_map(struct extent_map *em);
67int __init extent_map_init(void); 69int __init extent_map_init(void);
68void extent_map_exit(void); 70void extent_map_exit(void);
69int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); 71int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
72void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
70struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 73struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
71 u64 start, u64 len); 74 u64 start, u64 len);
72#endif 75#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..94aa53b38721 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
133 return ERR_PTR(ret); 133 return ERR_PTR(ret);
134} 134}
135 135
136
137int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, 136int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
138 struct btrfs_root *root, 137 struct btrfs_root *root,
139 struct btrfs_path *path, u64 objectid, 138 struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
151 return ret; 150 return ret;
152} 151}
153 152
153u64 btrfs_file_extent_length(struct btrfs_path *path)
154{
155 int extent_type;
156 struct btrfs_file_extent_item *fi;
157 u64 len;
158
159 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
160 struct btrfs_file_extent_item);
161 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
162
163 if (extent_type == BTRFS_FILE_EXTENT_REG ||
164 extent_type == BTRFS_FILE_EXTENT_PREALLOC)
165 len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
166 else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
167 len = btrfs_file_extent_inline_len(path->nodes[0], fi);
168 else
169 BUG();
170
171 return len;
172}
154 173
155static int __btrfs_lookup_bio_sums(struct btrfs_root *root, 174static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
156 struct inode *inode, struct bio *bio, 175 struct inode *inode, struct bio *bio,
@@ -441,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
441 if (!contig) 460 if (!contig)
442 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 461 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
443 462
444 if (!contig && (offset >= ordered->file_offset + ordered->len || 463 if (offset >= ordered->file_offset + ordered->len ||
445 offset < ordered->file_offset)) { 464 offset < ordered->file_offset) {
446 unsigned long bytes_left; 465 unsigned long bytes_left;
447 sums->len = this_sum_bytes; 466 sums->len = this_sum_bytes;
448 this_sum_bytes = 0; 467 this_sum_bytes = 0;
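The new btrfs_file_extent_length() helper added above picks the length accessor by extent type. The shape of that dispatch with the on-disk accessors replaced by a tagged struct; the names here are hypothetical, since the kernel reads these fields through btrfs_file_extent_* accessors and BUG()s on an unknown type.

#include <assert.h>
#include <stdio.h>

enum { FILE_EXTENT_INLINE, FILE_EXTENT_REG, FILE_EXTENT_PREALLOC };

struct file_extent { int type; unsigned long long num_bytes, inline_len; };

static unsigned long long file_extent_length(const struct file_extent *fi)
{
	if (fi->type == FILE_EXTENT_REG || fi->type == FILE_EXTENT_PREALLOC)
		return fi->num_bytes;	/* allocated length on disk */
	if (fi->type == FILE_EXTENT_INLINE)
		return fi->inline_len;	/* data lives in the item */
	assert(0 && "unknown extent type");	/* kernel uses BUG() */
	return 0;
}

int main(void)
{
	struct file_extent reg = { FILE_EXTENT_REG, 8192, 0 };
	struct file_extent inl = { FILE_EXTENT_INLINE, 0, 300 };

	printf("reg=%llu inline=%llu\n",
	       file_extent_length(&reg), file_extent_length(&inl));
	return 0;
}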
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..aeb84469d2c4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
90 * If an existing record is found the defrag item you 91 * If an existing record is found the defrag item you
91 * pass in is freed 92 * pass in is freed
92 */ 93 */
93static void __btrfs_add_inode_defrag(struct inode *inode, 94static int __btrfs_add_inode_defrag(struct inode *inode,
94 struct inode_defrag *defrag) 95 struct inode_defrag *defrag)
95{ 96{
96 struct btrfs_root *root = BTRFS_I(inode)->root; 97 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
118 entry->transid = defrag->transid; 119 entry->transid = defrag->transid;
119 if (defrag->last_offset > entry->last_offset) 120 if (defrag->last_offset > entry->last_offset)
120 entry->last_offset = defrag->last_offset; 121 entry->last_offset = defrag->last_offset;
121 goto exists; 122 return -EEXIST;
122 } 123 }
123 } 124 }
124 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 125 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
125 rb_link_node(&defrag->rb_node, parent, p); 126 rb_link_node(&defrag->rb_node, parent, p);
126 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 127 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
127 return; 128 return 0;
129}
128 130
129exists: 131static inline int __need_auto_defrag(struct btrfs_root *root)
130 kfree(defrag); 132{
131 return; 133 if (!btrfs_test_opt(root, AUTO_DEFRAG))
134 return 0;
132 135
136 if (btrfs_fs_closing(root->fs_info))
137 return 0;
138
139 return 1;
133} 140}
134 141
135/* 142/*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
142 struct btrfs_root *root = BTRFS_I(inode)->root; 149 struct btrfs_root *root = BTRFS_I(inode)->root;
143 struct inode_defrag *defrag; 150 struct inode_defrag *defrag;
144 u64 transid; 151 u64 transid;
152 int ret;
145 153
146 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 154 if (!__need_auto_defrag(root))
147 return 0;
148
149 if (btrfs_fs_closing(root->fs_info))
150 return 0; 155 return 0;
151 156
152 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 157 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
157 else 162 else
158 transid = BTRFS_I(inode)->root->last_trans; 163 transid = BTRFS_I(inode)->root->last_trans;
159 164
160 defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 165 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
161 if (!defrag) 166 if (!defrag)
162 return -ENOMEM; 167 return -ENOMEM;
163 168
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
166 defrag->root = root->root_key.objectid; 171 defrag->root = root->root_key.objectid;
167 172
168 spin_lock(&root->fs_info->defrag_inodes_lock); 173 spin_lock(&root->fs_info->defrag_inodes_lock);
169 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 174 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
170 __btrfs_add_inode_defrag(inode, defrag); 175 /*
171 else 176 * If we set IN_DEFRAG flag and evict the inode from memory,
172 kfree(defrag); 177 * and then re-read this inode, this new inode doesn't have
178 * IN_DEFRAG flag. In that case, we may find an existing defrag.
179 */
180 ret = __btrfs_add_inode_defrag(inode, defrag);
181 if (ret)
182 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
183 } else {
184 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
185 }
173 spin_unlock(&root->fs_info->defrag_inodes_lock); 186 spin_unlock(&root->fs_info->defrag_inodes_lock);
174 return 0; 187 return 0;
175} 188}
176 189
177/* 190/*
178 * must be called with the defrag_inodes lock held 191 * Requeue the defrag object. If there is a defrag object that points to
192 * the same inode in the tree, we will merge them together (by
193 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
179 */ 194 */
180struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 195void btrfs_requeue_inode_defrag(struct inode *inode,
181 u64 root, u64 ino, 196 struct inode_defrag *defrag)
182 struct rb_node **next) 197{
198 struct btrfs_root *root = BTRFS_I(inode)->root;
199 int ret;
200
201 if (!__need_auto_defrag(root))
202 goto out;
203
204 /*
205 * Here we don't check the IN_DEFRAG flag, because we need to merge
206 * the records together.
207 */
208 spin_lock(&root->fs_info->defrag_inodes_lock);
209 ret = __btrfs_add_inode_defrag(inode, defrag);
210 spin_unlock(&root->fs_info->defrag_inodes_lock);
211 if (ret)
212 goto out;
213 return;
214out:
215 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
216}
217
218/*
219 * pick the defraggable inode that we want; if it doesn't exist, we will get
220 * the next one.
221 */
222static struct inode_defrag *
223btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
183{ 224{
184 struct inode_defrag *entry = NULL; 225 struct inode_defrag *entry = NULL;
185 struct inode_defrag tmp; 226 struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
190 tmp.ino = ino; 231 tmp.ino = ino;
191 tmp.root = root; 232 tmp.root = root;
192 233
193 p = info->defrag_inodes.rb_node; 234 spin_lock(&fs_info->defrag_inodes_lock);
235 p = fs_info->defrag_inodes.rb_node;
194 while (p) { 236 while (p) {
195 parent = p; 237 parent = p;
196 entry = rb_entry(parent, struct inode_defrag, rb_node); 238 entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,145 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
201 else if (ret > 0) 243 else if (ret > 0)
202 p = parent->rb_right; 244 p = parent->rb_right;
203 else 245 else
204 return entry; 246 goto out;
205 } 247 }
206 248
207 if (next) { 249 if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
208 while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 250 parent = rb_next(parent);
209 parent = rb_next(parent); 251 if (parent)
210 entry = rb_entry(parent, struct inode_defrag, rb_node); 252 entry = rb_entry(parent, struct inode_defrag, rb_node);
211 } 253 else
212 *next = parent; 254 entry = NULL;
213 } 255 }
214 return NULL; 256out:
257 if (entry)
258 rb_erase(parent, &fs_info->defrag_inodes);
259 spin_unlock(&fs_info->defrag_inodes_lock);
260 return entry;
215} 261}
216 262
217/* 263void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
218 * run through the list of inodes in the FS that need
219 * defragging
220 */
221int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
222{ 264{
223 struct inode_defrag *defrag; 265 struct inode_defrag *defrag;
266 struct rb_node *node;
267
268 spin_lock(&fs_info->defrag_inodes_lock);
269 node = rb_first(&fs_info->defrag_inodes);
270 while (node) {
271 rb_erase(node, &fs_info->defrag_inodes);
272 defrag = rb_entry(node, struct inode_defrag, rb_node);
273 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
274
275 if (need_resched()) {
276 spin_unlock(&fs_info->defrag_inodes_lock);
277 cond_resched();
278 spin_lock(&fs_info->defrag_inodes_lock);
279 }
280
281 node = rb_first(&fs_info->defrag_inodes);
282 }
283 spin_unlock(&fs_info->defrag_inodes_lock);
284}
285
286#define BTRFS_DEFRAG_BATCH 1024
287
288static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
289 struct inode_defrag *defrag)
290{
224 struct btrfs_root *inode_root; 291 struct btrfs_root *inode_root;
225 struct inode *inode; 292 struct inode *inode;
226 struct rb_node *n;
227 struct btrfs_key key; 293 struct btrfs_key key;
228 struct btrfs_ioctl_defrag_range_args range; 294 struct btrfs_ioctl_defrag_range_args range;
229 u64 first_ino = 0;
230 u64 root_objectid = 0;
231 int num_defrag; 295 int num_defrag;
232 int defrag_batch = 1024; 296 int index;
297 int ret;
298
299 /* get the inode */
300 key.objectid = defrag->root;
301 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
302 key.offset = (u64)-1;
303
304 index = srcu_read_lock(&fs_info->subvol_srcu);
305
306 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
307 if (IS_ERR(inode_root)) {
308 ret = PTR_ERR(inode_root);
309 goto cleanup;
310 }
311 if (btrfs_root_refs(&inode_root->root_item) == 0) {
312 ret = -ENOENT;
313 goto cleanup;
314 }
233 315
316 key.objectid = defrag->ino;
317 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
318 key.offset = 0;
319 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
320 if (IS_ERR(inode)) {
321 ret = PTR_ERR(inode);
322 goto cleanup;
323 }
324 srcu_read_unlock(&fs_info->subvol_srcu, index);
325
326 /* do a chunk of defrag */
327 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
234 memset(&range, 0, sizeof(range)); 328 memset(&range, 0, sizeof(range));
235 range.len = (u64)-1; 329 range.len = (u64)-1;
330 range.start = defrag->last_offset;
331
332 sb_start_write(fs_info->sb);
333 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
334 BTRFS_DEFRAG_BATCH);
335 sb_end_write(fs_info->sb);
336 /*
337 * if we filled the whole defrag batch, there
338 * must be more work to do. Queue this defrag
339 * again
340 */
341 if (num_defrag == BTRFS_DEFRAG_BATCH) {
342 defrag->last_offset = range.start;
343 btrfs_requeue_inode_defrag(inode, defrag);
344 } else if (defrag->last_offset && !defrag->cycled) {
345 /*
346 * we didn't fill our defrag batch, but
347 * we didn't start at zero. Make sure we loop
348 * around to the start of the file.
349 */
350 defrag->last_offset = 0;
351 defrag->cycled = 1;
352 btrfs_requeue_inode_defrag(inode, defrag);
353 } else {
354 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
355 }
356
357 iput(inode);
358 return 0;
359cleanup:
360 srcu_read_unlock(&fs_info->subvol_srcu, index);
361 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
362 return ret;
363}
364
365/*
366 * run through the list of inodes in the FS that need
367 * defragging
368 */
369int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
370{
371 struct inode_defrag *defrag;
372 u64 first_ino = 0;
373 u64 root_objectid = 0;
236 374
237 atomic_inc(&fs_info->defrag_running); 375 atomic_inc(&fs_info->defrag_running);
238 spin_lock(&fs_info->defrag_inodes_lock);
239 while(1) { 376 while(1) {
240 n = NULL; 377 if (!__need_auto_defrag(fs_info->tree_root))
378 break;
241 379
242 /* find an inode to defrag */ 380 /* find an inode to defrag */
243 defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 381 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
244 first_ino, &n); 382 first_ino);
245 if (!defrag) { 383 if (!defrag) {
246 if (n) { 384 if (root_objectid || first_ino) {
247 defrag = rb_entry(n, struct inode_defrag,
248 rb_node);
249 } else if (root_objectid || first_ino) {
250 root_objectid = 0; 385 root_objectid = 0;
251 first_ino = 0; 386 first_ino = 0;
252 continue; 387 continue;
@@ -255,70 +390,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
255 } 390 }
256 } 391 }
257 392
258 /* remove it from the rbtree */
259 first_ino = defrag->ino + 1; 393 first_ino = defrag->ino + 1;
260 root_objectid = defrag->root; 394 root_objectid = defrag->root;
261 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
262
263 if (btrfs_fs_closing(fs_info))
264 goto next_free;
265
266 spin_unlock(&fs_info->defrag_inodes_lock);
267
268 /* get the inode */
269 key.objectid = defrag->root;
270 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
271 key.offset = (u64)-1;
272 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
273 if (IS_ERR(inode_root))
274 goto next;
275
276 key.objectid = defrag->ino;
277 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
278 key.offset = 0;
279
280 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
281 if (IS_ERR(inode))
282 goto next;
283
284 /* do a chunk of defrag */
285 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
286 range.start = defrag->last_offset;
287 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
288 defrag_batch);
289 /*
290 * if we filled the whole defrag batch, there
291 * must be more work to do. Queue this defrag
292 * again
293 */
294 if (num_defrag == defrag_batch) {
295 defrag->last_offset = range.start;
296 __btrfs_add_inode_defrag(inode, defrag);
297 /*
298 * we don't want to kfree defrag, we added it back to
299 * the rbtree
300 */
301 defrag = NULL;
302 } else if (defrag->last_offset && !defrag->cycled) {
303 /*
304 * we didn't fill our defrag batch, but
305 * we didn't start at zero. Make sure we loop
306 * around to the start of the file.
307 */
308 defrag->last_offset = 0;
309 defrag->cycled = 1;
310 __btrfs_add_inode_defrag(inode, defrag);
311 defrag = NULL;
312 }
313 395
314 iput(inode); 396 __btrfs_run_defrag_inode(fs_info, defrag);
315next:
316 spin_lock(&fs_info->defrag_inodes_lock);
317next_free:
318 kfree(defrag);
319 } 397 }
320 spin_unlock(&fs_info->defrag_inodes_lock);
321
322 atomic_dec(&fs_info->defrag_running); 398 atomic_dec(&fs_info->defrag_running);
323 399
324 /* 400 /*
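The rework above moves the batch-and-requeue policy into __btrfs_run_defrag_inode(): a full batch of BTRFS_DEFRAG_BATCH extents means more work, so the record is requeued at the offset reached; a partial batch that did not start at offset zero wraps around once; anything else frees the record. A toy trace of just that policy, with the defrag work itself faked by a fragment counter:

#include <stdio.h>

#define DEFRAG_BATCH 4

struct defrag { unsigned long long last_offset; int cycled; };

static int defrag_one_batch(int *extents_left)
{
	int done = *extents_left < DEFRAG_BATCH ? *extents_left
						: DEFRAG_BATCH;
	*extents_left -= done;
	return done;
}

int main(void)
{
	struct defrag d = { .last_offset = 4096, .cycled = 0 };
	int extents_left = 9;

	for (;;) {
		int n = defrag_one_batch(&extents_left);

		if (n == DEFRAG_BATCH) {
			d.last_offset += n;	/* stand-in for range.start */
			printf("full batch, requeue at %llu\n",
			       d.last_offset);
		} else if (d.last_offset && !d.cycled) {
			/* started mid-file: wrap to the start once */
			d.last_offset = 0;
			d.cycled = 1;
			printf("partial batch, wrap to offset 0\n");
			extents_left = 2;	/* pretend the head had a few */
		} else {
			printf("done, free the defrag record\n");
			break;
		}
	}
	return 0;
}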
@@ -526,6 +602,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
526 split->block_len = em->block_len; 602 split->block_len = em->block_len;
527 else 603 else
528 split->block_len = split->len; 604 split->block_len = split->len;
605 split->orig_block_len = max(split->block_len,
606 em->orig_block_len);
529 split->generation = gen; 607 split->generation = gen;
530 split->bdev = em->bdev; 608 split->bdev = em->bdev;
531 split->flags = flags; 609 split->flags = flags;
@@ -547,6 +625,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
547 split->flags = flags; 625 split->flags = flags;
548 split->compress_type = em->compress_type; 626 split->compress_type = em->compress_type;
549 split->generation = gen; 627 split->generation = gen;
628 split->orig_block_len = max(em->block_len,
629 em->orig_block_len);
550 630
551 if (compressed) { 631 if (compressed) {
552 split->block_len = em->block_len; 632 split->block_len = em->block_len;
@@ -555,7 +635,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
555 } else { 635 } else {
556 split->block_len = split->len; 636 split->block_len = split->len;
557 split->block_start = em->block_start + diff; 637 split->block_start = em->block_start + diff;
558 split->orig_start = split->start; 638 split->orig_start = em->orig_start;
559 } 639 }
560 640
561 ret = add_extent_mapping(em_tree, split); 641 ret = add_extent_mapping(em_tree, split);
@@ -1346,10 +1426,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1346 1426
1347 cond_resched(); 1427 cond_resched();
1348 1428
1349 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1429 balance_dirty_pages_ratelimited(inode->i_mapping);
1350 dirty_pages);
1351 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1430 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1352 btrfs_btree_balance_dirty(root, 1); 1431 btrfs_btree_balance_dirty(root);
1353 1432
1354 pos += copied; 1433 pos += copied;
1355 num_written += copied; 1434 num_written += copied;
@@ -1398,6 +1477,24 @@ out:
1398 return written ? written : err; 1477 return written ? written : err;
1399} 1478}
1400 1479
1480static void update_time_for_write(struct inode *inode)
1481{
1482 struct timespec now;
1483
1484 if (IS_NOCMTIME(inode))
1485 return;
1486
1487 now = current_fs_time(inode->i_sb);
1488 if (!timespec_equal(&inode->i_mtime, &now))
1489 inode->i_mtime = now;
1490
1491 if (!timespec_equal(&inode->i_ctime, &now))
1492 inode->i_ctime = now;
1493
1494 if (IS_I_VERSION(inode))
1495 inode_inc_iversion(inode);
1496}
1497
1401static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1498static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1402 const struct iovec *iov, 1499 const struct iovec *iov,
1403 unsigned long nr_segs, loff_t pos) 1500 unsigned long nr_segs, loff_t pos)
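update_time_for_write() above replaces the file_update_time() call so that no separate transaction is needed: the time fields are updated in memory and persisted by the inode update the write already pays for. A userspace sketch of the compare-and-set pattern, using a toy inode and a local timespec_equal(); IS_NOCMTIME and the iversion helpers are kernel-side and only mimicked here.

#include <stdio.h>
#include <time.h>

static int timespec_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

struct toy_inode { struct timespec mtime, ctime; unsigned long iversion; };

static void update_time_for_write(struct toy_inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	if (!timespec_equal(&inode->mtime, &now))
		inode->mtime = now;	/* only touch what changed */
	if (!timespec_equal(&inode->ctime, &now))
		inode->ctime = now;
	inode->iversion++;	/* stands in for inode_inc_iversion() */
}

int main(void)
{
	struct toy_inode ino = { {0, 0}, {0, 0}, 0 };

	update_time_for_write(&ino);
	printf("mtime=%lld iversion=%lu\n",
	       (long long)ino.mtime.tv_sec, ino.iversion);
	return 0;
}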
@@ -1410,6 +1507,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1410 ssize_t num_written = 0; 1507 ssize_t num_written = 0;
1411 ssize_t err = 0; 1508 ssize_t err = 0;
1412 size_t count, ocount; 1509 size_t count, ocount;
1510 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1413 1511
1414 sb_start_write(inode->i_sb); 1512 sb_start_write(inode->i_sb);
1415 1513
@@ -1452,11 +1550,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1452 goto out; 1550 goto out;
1453 } 1551 }
1454 1552
1455 err = file_update_time(file); 1553 /*
1456 if (err) { 1554 * We reserve space for updating the inode when we reserve space for the
1457 mutex_unlock(&inode->i_mutex); 1555 * extent we are going to write, so we will enospc out there. We don't
1458 goto out; 1556 * need to start yet another transaction to update the inode as we will
1459 } 1557 * update the inode when we finish writing whatever data we write.
1558 */
1559 update_time_for_write(inode);
1460 1560
1461 start_pos = round_down(pos, root->sectorsize); 1561 start_pos = round_down(pos, root->sectorsize);
1462 if (start_pos > i_size_read(inode)) { 1562 if (start_pos > i_size_read(inode)) {
@@ -1467,6 +1567,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1467 } 1567 }
1468 } 1568 }
1469 1569
1570 if (sync)
1571 atomic_inc(&BTRFS_I(inode)->sync_writers);
1572
1470 if (unlikely(file->f_flags & O_DIRECT)) { 1573 if (unlikely(file->f_flags & O_DIRECT)) {
1471 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1574 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1472 pos, ppos, count, ocount); 1575 pos, ppos, count, ocount);
@@ -1493,13 +1596,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1493 * this will either be one more than the running transaction 1596 * this will either be one more than the running transaction
1494 * or the generation used for the next transaction if there isn't 1597 * or the generation used for the next transaction if there isn't
1495 * one running right now. 1598 * one running right now.
1599 *
1600 * We also have to set last_sub_trans to the current log transid,
1601 * otherwise subsequent syncs to a file that's been synced in this
1602 * transaction will appear to have already occurred.
1496 */ 1603 */
1497 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1604 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1605 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1498 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1606 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1499 err = generic_write_sync(file, pos, num_written); 1607 err = generic_write_sync(file, pos, num_written);
1500 if (err < 0 && num_written > 0) 1608 if (err < 0 && num_written > 0)
1501 num_written = err; 1609 num_written = err;
1502 } 1610 }
1611
1612 if (sync)
1613 atomic_dec(&BTRFS_I(inode)->sync_writers);
1503out: 1614out:
1504 sb_end_write(inode->i_sb); 1615 sb_end_write(inode->i_sb);
1505 current->backing_dev_info = NULL; 1616 current->backing_dev_info = NULL;
@@ -1551,7 +1662,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1551 * out of the ->i_mutex. If so, we can flush the dirty pages by 1662 * out of the ->i_mutex. If so, we can flush the dirty pages by
1552 * multiple tasks, improving performance. 1663 * multiple tasks, improving performance.
1553 */ 1664 */
1665 atomic_inc(&BTRFS_I(inode)->sync_writers);
1554 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1666 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1667 atomic_dec(&BTRFS_I(inode)->sync_writers);
1555 if (ret) 1668 if (ret)
1556 return ret; 1669 return ret;
1557 1670
@@ -1562,7 +1675,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1562 * range being left. 1675 * range being left.
1563 */ 1676 */
1564 atomic_inc(&root->log_batch); 1677 atomic_inc(&root->log_batch);
1565 btrfs_wait_ordered_range(inode, start, end); 1678 btrfs_wait_ordered_range(inode, start, end - start + 1);
1566 atomic_inc(&root->log_batch); 1679 atomic_inc(&root->log_batch);
1567 1680
1568 /* 1681 /*
@@ -1768,6 +1881,7 @@ out:
1768 1881
1769 hole_em->block_start = EXTENT_MAP_HOLE; 1882 hole_em->block_start = EXTENT_MAP_HOLE;
1770 hole_em->block_len = 0; 1883 hole_em->block_len = 0;
1884 hole_em->orig_block_len = 0;
1771 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 1885 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
1772 hole_em->compress_type = BTRFS_COMPRESS_NONE; 1886 hole_em->compress_type = BTRFS_COMPRESS_NONE;
1773 hole_em->generation = trans->transid; 1887 hole_em->generation = trans->transid;
@@ -1797,48 +1911,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1797 struct btrfs_path *path; 1911 struct btrfs_path *path;
1798 struct btrfs_block_rsv *rsv; 1912 struct btrfs_block_rsv *rsv;
1799 struct btrfs_trans_handle *trans; 1913 struct btrfs_trans_handle *trans;
1800 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 1914 u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
1801 u64 lockstart = (offset + mask) & ~mask; 1915 u64 lockend = round_down(offset + len,
1802 u64 lockend = ((offset + len) & ~mask) - 1; 1916 BTRFS_I(inode)->root->sectorsize) - 1;
1803 u64 cur_offset = lockstart; 1917 u64 cur_offset = lockstart;
1804 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 1918 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
1805 u64 drop_end; 1919 u64 drop_end;
1806 unsigned long nr;
1807 int ret = 0; 1920 int ret = 0;
1808 int err = 0; 1921 int err = 0;
1809 bool same_page = (offset >> PAGE_CACHE_SHIFT) == 1922 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
1810 ((offset + len) >> PAGE_CACHE_SHIFT); 1923 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
1811 1924
1812 btrfs_wait_ordered_range(inode, offset, len); 1925 btrfs_wait_ordered_range(inode, offset, len);
1813 1926
1814 mutex_lock(&inode->i_mutex); 1927 mutex_lock(&inode->i_mutex);
1815 if (offset >= inode->i_size) { 1928 /*
1816 mutex_unlock(&inode->i_mutex); 1929 * We needn't truncate any page which is beyond the end of the file
1817 return 0; 1930 * because we are sure there is no data there.
1818 } 1931 */
1819
1820 /* 1932 /*
1821 * Only do this if we are in the same page and we aren't doing the 1933 * Only do this if we are in the same page and we aren't doing the
1822 * entire page. 1934 * entire page.
1823 */ 1935 */
1824 if (same_page && len < PAGE_CACHE_SIZE) { 1936 if (same_page && len < PAGE_CACHE_SIZE) {
1825 ret = btrfs_truncate_page(inode, offset, len, 0); 1937 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
1938 ret = btrfs_truncate_page(inode, offset, len, 0);
1826 mutex_unlock(&inode->i_mutex); 1939 mutex_unlock(&inode->i_mutex);
1827 return ret; 1940 return ret;
1828 } 1941 }
1829 1942
1830 /* zero back part of the first page */ 1943 /* zero back part of the first page */
1831 ret = btrfs_truncate_page(inode, offset, 0, 0); 1944 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1832 if (ret) { 1945 ret = btrfs_truncate_page(inode, offset, 0, 0);
1833 mutex_unlock(&inode->i_mutex); 1946 if (ret) {
1834 return ret; 1947 mutex_unlock(&inode->i_mutex);
1948 return ret;
1949 }
1835 } 1950 }
1836 1951
1837 /* zero the front end of the last page */ 1952 /* zero the front end of the last page */
1838 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 1953 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
1839 if (ret) { 1954 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
1840 mutex_unlock(&inode->i_mutex); 1955 if (ret) {
1841 return ret; 1956 mutex_unlock(&inode->i_mutex);
1957 return ret;
1958 }
1842 } 1959 }
1843 1960
1844 if (lockend < lockstart) { 1961 if (lockend < lockstart) {
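The punch-hole rework above also fixes the same_page test: the last byte of the hole is offset + len - 1, so a hole that ends exactly on a page boundary was previously mis-classified. A quick check of both formulas, assuming 4K pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4K pages assumed */

int main(void)
{
	unsigned long long offset = 100, len = 4096 - 100;
	int before = (offset >> PAGE_CACHE_SHIFT) ==
		     ((offset + len) >> PAGE_CACHE_SHIFT);
	int after = (offset >> PAGE_CACHE_SHIFT) ==
		    ((offset + len - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * The hole [100, 4095] lives entirely in page 0, but the old
	 * formula peeked at byte 4096 (page 1) and said otherwise.
	 */
	printf("before=%d after=%d\n", before, after);
	return 0;
}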
@@ -1931,9 +2048,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
1931 break; 2048 break;
1932 } 2049 }
1933 2050
1934 nr = trans->blocks_used;
1935 btrfs_end_transaction(trans, root); 2051 btrfs_end_transaction(trans, root);
1936 btrfs_btree_balance_dirty(root, nr); 2052 btrfs_btree_balance_dirty(root);
1937 2053
1938 trans = btrfs_start_transaction(root, 3); 2054 trans = btrfs_start_transaction(root, 3);
1939 if (IS_ERR(trans)) { 2055 if (IS_ERR(trans)) {
@@ -1964,11 +2080,13 @@ out_trans:
1964 if (!trans) 2080 if (!trans)
1965 goto out_free; 2081 goto out_free;
1966 2082
2083 inode_inc_iversion(inode);
2084 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2085
1967 trans->block_rsv = &root->fs_info->trans_block_rsv; 2086 trans->block_rsv = &root->fs_info->trans_block_rsv;
1968 ret = btrfs_update_inode(trans, root, inode); 2087 ret = btrfs_update_inode(trans, root, inode);
1969 nr = trans->blocks_used;
1970 btrfs_end_transaction(trans, root); 2088 btrfs_end_transaction(trans, root);
1971 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root);
1972out_free: 2090out_free:
1973 btrfs_free_path(path); 2091 btrfs_free_path(path);
1974 btrfs_free_block_rsv(root, rsv); 2092 btrfs_free_block_rsv(root, rsv);
@@ -1992,12 +2110,12 @@ static long btrfs_fallocate(struct file *file, int mode,
1992 u64 alloc_end; 2110 u64 alloc_end;
1993 u64 alloc_hint = 0; 2111 u64 alloc_hint = 0;
1994 u64 locked_end; 2112 u64 locked_end;
1995 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1996 struct extent_map *em; 2113 struct extent_map *em;
2114 int blocksize = BTRFS_I(inode)->root->sectorsize;
1997 int ret; 2115 int ret;
1998 2116
1999 alloc_start = offset & ~mask; 2117 alloc_start = round_down(offset, blocksize);
2000 alloc_end = (offset + len + mask) & ~mask; 2118 alloc_end = round_up(offset + len, blocksize);
2001 2119
2002 /* Make sure we aren't being given some crap mode */ 2120
2003 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2121 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
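Switching from open-coded mask arithmetic to round_down()/round_up() also fixes the reservation length: alloc_end is an exclusive bound, so the correct byte count is alloc_end - alloc_start with no "+ 1". A userspace sketch of the alignment, assuming power-of-two block sizes as the kernel macros do:

#include <stdio.h>
#include <stdint.h>

/* power-of-two alignment, same shape as the kernel macros */
#define round_down(x, a) ((x) & ~((uint64_t)(a) - 1))
#define round_up(x, a)   (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t blocksize = 4096, offset = 5000, len = 3000;
	uint64_t alloc_start = round_down(offset, blocksize);          /* 4096 */
	uint64_t alloc_end   = round_up(offset + len, blocksize);      /* 8192 */

	/* alloc_end is exclusive, so the reservation is exactly
	 * alloc_end - alloc_start */
	printf("start=%llu end=%llu bytes=%llu\n",
	       (unsigned long long)alloc_start,
	       (unsigned long long)alloc_end,
	       (unsigned long long)(alloc_end - alloc_start));
	return 0;
}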
@@ -2010,7 +2128,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2010 * Make sure we have enough space before we do the 2128 * Make sure we have enough space before we do the
2011 * allocation. 2129 * allocation.
2012 */ 2130 */
2013 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); 2131 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2014 if (ret) 2132 if (ret)
2015 return ret; 2133 return ret;
2016 2134
@@ -2078,7 +2196,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2078 } 2196 }
2079 last_byte = min(extent_map_end(em), alloc_end); 2197 last_byte = min(extent_map_end(em), alloc_end);
2080 actual_end = min_t(u64, extent_map_end(em), offset + len); 2198 actual_end = min_t(u64, extent_map_end(em), offset + len);
2081 last_byte = (last_byte + mask) & ~mask; 2199 last_byte = ALIGN(last_byte, blocksize);
2082 2200
2083 if (em->block_start == EXTENT_MAP_HOLE || 2201 if (em->block_start == EXTENT_MAP_HOLE ||
2084 (cur_offset >= inode->i_size && 2202 (cur_offset >= inode->i_size &&
@@ -2117,11 +2235,11 @@ static long btrfs_fallocate(struct file *file, int mode,
2117out: 2235out:
2118 mutex_unlock(&inode->i_mutex); 2236 mutex_unlock(&inode->i_mutex);
2119 /* Let go of our reservation. */ 2237 /* Let go of our reservation. */
2120 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); 2238 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2121 return ret; 2239 return ret;
2122} 2240}
2123 2241
2124static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) 2242static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2125{ 2243{
2126 struct btrfs_root *root = BTRFS_I(inode)->root; 2244 struct btrfs_root *root = BTRFS_I(inode)->root;
2127 struct extent_map *em; 2245 struct extent_map *em;
@@ -2138,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2138 if (lockend <= lockstart) 2256 if (lockend <= lockstart)
2139 lockend = lockstart + root->sectorsize; 2257 lockend = lockstart + root->sectorsize;
2140 2258
2259 lockend--;
2141 len = lockend - lockstart + 1; 2260 len = lockend - lockstart + 1;
2142 2261
2143 len = max_t(u64, len, root->sectorsize); 2262 len = max_t(u64, len, root->sectorsize);
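The added lockend-- turns lockend into the last byte of the locked range rather than its exclusive end, so the inclusive length formula len = lockend - lockstart + 1 comes out right. Illustrative numbers only:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t lockstart = 0, lockend = sectorsize; /* exclusive end */

	lockend--;                              /* now the last byte */
	uint64_t len = lockend - lockstart + 1; /* inclusive count: 4096 */
	printf("len=%llu\n", (unsigned long long)len);
	return 0;
}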
@@ -2155,7 +2274,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2155 * before the position we want in case there is outstanding delalloc 2274 * before the position we want in case there is outstanding delalloc
2156 * going on here. 2275 * going on here.
2157 */ 2276 */
2158 if (origin == SEEK_HOLE && start != 0) { 2277 if (whence == SEEK_HOLE && start != 0) {
2159 if (start <= root->sectorsize) 2278 if (start <= root->sectorsize)
2160 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, 2279 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
2161 root->sectorsize, 0); 2280 root->sectorsize, 0);
@@ -2189,13 +2308,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2189 } 2308 }
2190 } 2309 }
2191 2310
2192 if (origin == SEEK_HOLE) { 2311 if (whence == SEEK_HOLE) {
2193 *offset = start; 2312 *offset = start;
2194 free_extent_map(em); 2313 free_extent_map(em);
2195 break; 2314 break;
2196 } 2315 }
2197 } else { 2316 } else {
2198 if (origin == SEEK_DATA) { 2317 if (whence == SEEK_DATA) {
2199 if (em->block_start == EXTENT_MAP_DELALLOC) { 2318 if (em->block_start == EXTENT_MAP_DELALLOC) {
2200 if (start >= inode->i_size) { 2319 if (start >= inode->i_size) {
2201 free_extent_map(em); 2320 free_extent_map(em);
@@ -2204,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
2204 } 2323 }
2205 } 2324 }
2206 2325
2207 *offset = start; 2326 if (!test_bit(EXTENT_FLAG_PREALLOC,
2208 free_extent_map(em); 2327 &em->flags)) {
2209 break; 2328 *offset = start;
2329 free_extent_map(em);
2330 break;
2331 }
2210 } 2332 }
2211 } 2333 }
2212 2334
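With this hunk, SEEK_DATA skips extents flagged EXTENT_FLAG_PREALLOC, since preallocated space contains no written data. The effect is visible from userspace through lseek(2); a hedged sketch, assuming a kernel and filesystem that support SEEK_DATA/SEEK_HOLE and fallocate-based preallocation:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return 1;

	/* preallocate 1 MiB without writing any data */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");

	/* write real data at 1 MiB so there is something to find */
	pwrite(fd, "x", 1, 1 << 20);

	/* with the prealloc-aware SEEK_DATA, this lands at 1 MiB,
	 * not at offset 0 inside the unwritten extent */
	off_t data = lseek(fd, 0, SEEK_DATA);
	printf("first data at %lld\n", (long long)data);

	close(fd);
	return 0;
}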
@@ -2232,16 +2354,16 @@ out:
2232 return ret; 2354 return ret;
2233} 2355}
2234 2356
2235static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) 2357static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2236{ 2358{
2237 struct inode *inode = file->f_mapping->host; 2359 struct inode *inode = file->f_mapping->host;
2238 int ret; 2360 int ret;
2239 2361
2240 mutex_lock(&inode->i_mutex); 2362 mutex_lock(&inode->i_mutex);
2241 switch (origin) { 2363 switch (whence) {
2242 case SEEK_END: 2364 case SEEK_END:
2243 case SEEK_CUR: 2365 case SEEK_CUR:
2244 offset = generic_file_llseek(file, offset, origin); 2366 offset = generic_file_llseek(file, offset, whence);
2245 goto out; 2367 goto out;
2246 case SEEK_DATA: 2368 case SEEK_DATA:
2247 case SEEK_HOLE: 2369 case SEEK_HOLE:
@@ -2250,7 +2372,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
2250 return -ENXIO; 2372 return -ENXIO;
2251 } 2373 }
2252 2374
2253 ret = find_desired_extent(inode, &offset, origin); 2375 ret = find_desired_extent(inode, &offset, whence);
2254 if (ret) { 2376 if (ret) {
2255 mutex_unlock(&inode->i_mutex); 2377 mutex_unlock(&inode->i_mutex);
2256 return ret; 2378 return ret;
@@ -2293,3 +2415,21 @@ const struct file_operations btrfs_file_operations = {
2293 .compat_ioctl = btrfs_ioctl, 2415 .compat_ioctl = btrfs_ioctl,
2294#endif 2416#endif
2295}; 2417};
2418
2419void btrfs_auto_defrag_exit(void)
2420{
2421 if (btrfs_inode_defrag_cachep)
2422 kmem_cache_destroy(btrfs_inode_defrag_cachep);
2423}
2424
2425int btrfs_auto_defrag_init(void)
2426{
2427 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2428 sizeof(struct inode_defrag), 0,
2429 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2430 NULL);
2431 if (!btrfs_inode_defrag_cachep)
2432 return -ENOMEM;
2433
2434 return 0;
2435}
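The new init/exit pair is the standard lifecycle for a dedicated slab cache: create it once at module init, destroy it at exit. A kernel-style sketch of the same pattern for a hypothetical cache ("my_item" and "my_cachep" are illustrative names, not btrfs symbols):

#include <linux/slab.h>
#include <linux/errno.h>

struct my_item {
	u64 offset;
	int count;
};

static struct kmem_cache *my_cachep;

int my_cache_init(void)
{
	my_cachep = kmem_cache_create("my_item", sizeof(struct my_item), 0,
				      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				      NULL);
	return my_cachep ? 0 : -ENOMEM;
}

void my_cache_exit(void)
{
	if (my_cachep)
		kmem_cache_destroy(my_cachep);
}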
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..0be7a8742a43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
307 307
308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 308static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
309{ 309{
310 WARN_ON(io_ctl->cur);
311 BUG_ON(io_ctl->index >= io_ctl->num_pages); 310 BUG_ON(io_ctl->index >= io_ctl->num_pages);
312 io_ctl->page = io_ctl->pages[io_ctl->index++]; 311 io_ctl->page = io_ctl->pages[io_ctl->index++];
313 io_ctl->cur = kmap(io_ctl->page); 312 io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1250 * if previous extent entry covers the offset, 1249 * if previous extent entry covers the offset,
1251 * we should return it instead of the bitmap entry 1250 * we should return it instead of the bitmap entry
1252 */ 1251 */
1253 n = &entry->offset_index; 1252 n = rb_prev(&entry->offset_index);
1254 while (1) { 1253 if (n) {
1255 n = rb_prev(n);
1256 if (!n)
1257 break;
1258 prev = rb_entry(n, struct btrfs_free_space, 1254 prev = rb_entry(n, struct btrfs_free_space,
1259 offset_index); 1255 offset_index);
1260 if (!prev->bitmap) { 1256 if (!prev->bitmap &&
1261 if (prev->offset + prev->bytes > offset) 1257 prev->offset + prev->bytes > offset)
1262 entry = prev; 1258 entry = prev;
1263 break;
1264 }
1265 } 1259 }
1266 } 1260 }
1267 return entry; 1261 return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
1287 } 1281 }
1288 1282
1289 if (entry->bitmap) { 1283 if (entry->bitmap) {
1290 n = &entry->offset_index; 1284 n = rb_prev(&entry->offset_index);
1291 while (1) { 1285 if (n) {
1292 n = rb_prev(n);
1293 if (!n)
1294 break;
1295 prev = rb_entry(n, struct btrfs_free_space, 1286 prev = rb_entry(n, struct btrfs_free_space,
1296 offset_index); 1287 offset_index);
1297 if (!prev->bitmap) { 1288 if (!prev->bitmap &&
1298 if (prev->offset + prev->bytes > offset) 1289 prev->offset + prev->bytes > offset)
1299 return prev; 1290 return prev;
1300 break;
1301 }
1302 } 1291 }
1303 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1292 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1304 return entry; 1293 return entry;
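Both hunks collapse a backwards-walking loop into a single rb_prev() call: only the immediately preceding extent entry can cover the target offset, so walking further back past bitmap entries is unnecessary. A generic userspace sketch of the simplified shape (a plain prev pointer stands in for the kernel's rb_prev()):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct node {
	struct node *prev;
	bool bitmap;
	uint64_t offset, bytes;
};

/* before: a loop that inspected predecessors one by one;
 * after: one lookup plus one combined condition */
static struct node *pick_entry(struct node *entry, uint64_t offset)
{
	struct node *n = entry->prev; /* kernel: rb_prev(&entry->offset_index) */

	if (n && !n->bitmap && n->offset + n->bytes > offset)
		return n; /* previous extent entry covers the offset */
	return entry;
}

int main(void)
{
	struct node prev  = { NULL, false, 0, 8192 };
	struct node entry = { &prev, true, 4096, 4096 };

	printf("%s\n", pick_entry(&entry, 4096) == &prev ? "prev" : "entry");
	return 0;
}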
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -1874,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1874{ 1862{
1875 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1863 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1876 struct btrfs_free_space *info; 1864 struct btrfs_free_space *info;
1877 int ret = 0; 1865 int ret;
1866 bool re_search = false;
1878 1867
1879 spin_lock(&ctl->tree_lock); 1868 spin_lock(&ctl->tree_lock);
1880 1869
1881again: 1870again:
1871 ret = 0;
1882 if (!bytes) 1872 if (!bytes)
1883 goto out_lock; 1873 goto out_lock;
1884 1874
@@ -1891,17 +1881,17 @@ again:
1891 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1881 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1892 1, 0); 1882 1, 0);
1893 if (!info) { 1883 if (!info) {
1894 /* the tree logging code might be calling us before we 1884 /*
1895 * have fully loaded the free space rbtree for this 1885 * If we found a partial bit of our free space in a
1896 * block group. So it is possible the entry won't 1886 * bitmap but then couldn't find the other part, this may
1897 * be in the rbtree yet at all. The caching code 1887 * be a problem, so WARN about it.
1898 * will make sure not to put it in the rbtree if
1899 * the logging code has pinned it.
1900 */ 1888 */
1889 WARN_ON(re_search);
1901 goto out_lock; 1890 goto out_lock;
1902 } 1891 }
1903 } 1892 }
1904 1893
1894 re_search = false;
1905 if (!info->bitmap) { 1895 if (!info->bitmap) {
1906 unlink_free_space(ctl, info); 1896 unlink_free_space(ctl, info);
1907 if (offset == info->offset) { 1897 if (offset == info->offset) {
@@ -1947,8 +1937,10 @@ again:
1947 } 1937 }
1948 1938
1949 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 1939 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
1950 if (ret == -EAGAIN) 1940 if (ret == -EAGAIN) {
1941 re_search = true;
1951 goto again; 1942 goto again;
1943 }
1952 BUG_ON(ret); /* logic error */ 1944 BUG_ON(ret); /* logic error */
1953out_lock: 1945out_lock:
1954 spin_unlock(&ctl->tree_lock); 1946 spin_unlock(&ctl->tree_lock);
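remove_from_bitmap() can consume the portion of the range it finds and return -EAGAIN to request a second lookup; the new re_search flag lets that retry WARN if the remainder has gone missing, instead of silently treating it as success. A userspace sketch of the retry shape (the stub is illustrative, not the btrfs function):

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

/* stub: the first call consumes part of the range and asks to retry */
static int remove_from_bitmap(int pass)
{
	return pass == 0 ? -EAGAIN : 0;
}

int main(void)
{
	bool re_search = false;
	int pass = 0;
	int ret;

again:
	ret = 0;
	/* the lookup happens here; finding nothing while re_search is
	 * set is the suspicious case the new WARN_ON(re_search) catches */
	re_search = false;
	ret = remove_from_bitmap(pass++);
	if (ret == -EAGAIN) {
		re_search = true;
		goto again;
	}
	printf("ret=%d re_search=%d\n", ret, re_search);
	return 0;
}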
@@ -2298,10 +2290,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2290 unsigned long total_found = 0;
2299 int ret; 2291 int ret;
2300 2292
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2293 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2294 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2295 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2296 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2297
2306again: 2298again:
2307 found_bits = 0; 2299 found_bits = 0;
@@ -2325,23 +2317,22 @@ again:
2325 2317
2326 total_found += found_bits; 2318 total_found += found_bits;
2327 2319
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2320 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2321 cluster->max_size = found_bits * ctl->unit;
2330 2322
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2323 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2324 i = next_zero + 1;
2333 goto again; 2325 goto again;
2334 } 2326 }
2335 2327
2336 cluster->window_start = start * block_group->sectorsize + 2328 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2329 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2330 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2331 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2332 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2333
2343 trace_btrfs_setup_cluster(block_group, cluster, 2334 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2335 total_found * ctl->unit, 1);
2345 return 0; 2336 return 0;
2346} 2337}
2347 2338
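Replacing block_group->sectorsize with ctl->unit throughout makes the bitmap math follow whatever granularity the free-space ctl was configured with, rather than assuming the block group's sector size. The conversions are one bit per "unit" bytes; a userspace sketch of the idea (assuming simple truncating division, which the kernel helpers may round differently):

#include <stdio.h>
#include <stdint.h>

/* one bit per "unit" bytes */
static uint64_t offset_to_bit(uint64_t bitmap_start, uint32_t unit,
			      uint64_t offset)
{
	return (offset - bitmap_start) / unit;
}

static uint64_t bytes_to_bits(uint64_t bytes, uint32_t unit)
{
	return bytes / unit;
}

int main(void)
{
	uint32_t unit = 4096;       /* ctl->unit */
	uint64_t start = 1 << 20;   /* bitmap covers from 1 MiB */

	printf("bit=%llu\n",
	       (unsigned long long)offset_to_bit(start, unit, start + 3 * 4096));
	printf("bits=%llu\n",
	       (unsigned long long)bytes_to_bits(64 * 1024, unit));
	return 0;
}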
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
434 * 3 items for pre-allocation 434 * 3 items for pre-allocation
435 */ 435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 437 ret = btrfs_block_rsv_add(root, trans->block_rsv,
438 trans->bytes_reserved); 438 trans->bytes_reserved,
439 BTRFS_RESERVE_NO_FLUSH);
439 if (ret) 440 if (ret)
440 goto out; 441 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
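btrfs_block_rsv_add_noflush() is folded into btrfs_block_rsv_add() with an explicit flush-mode argument. A kernel-style sketch of the three levels this series works with (NO_FLUSH and FLUSH_LIMIT appear in this diff; FLUSH_ALL is assumed from the same series):

enum btrfs_reserve_flush_enum {
	BTRFS_RESERVE_NO_FLUSH,     /* never wait for or trigger flushing */
	BTRFS_RESERVE_FLUSH_LIMIT,  /* flush a bounded amount of space */
	BTRFS_RESERVE_FLUSH_ALL,    /* flush as much as needed */
};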
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..cc93b23ca352 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
71static struct extent_io_ops btrfs_extent_io_ops; 71static struct extent_io_ops btrfs_extent_io_ops;
72 72
73static struct kmem_cache *btrfs_inode_cachep; 73static struct kmem_cache *btrfs_inode_cachep;
74static struct kmem_cache *btrfs_delalloc_work_cachep;
74struct kmem_cache *btrfs_trans_handle_cachep; 75struct kmem_cache *btrfs_trans_handle_cachep;
75struct kmem_cache *btrfs_transaction_cachep; 76struct kmem_cache *btrfs_transaction_cachep;
76struct kmem_cache *btrfs_path_cachep; 77struct kmem_cache *btrfs_path_cachep;
@@ -87,13 +88,17 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 88 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
88}; 89};
89 90
90static int btrfs_setsize(struct inode *inode, loff_t newsize); 91static int btrfs_setsize(struct inode *inode, struct iattr *attr);
91static int btrfs_truncate(struct inode *inode); 92static int btrfs_truncate(struct inode *inode);
92static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 93static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93static noinline int cow_file_range(struct inode *inode, 94static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 95 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 96 u64 start, u64 end, int *page_started,
96 unsigned long *nr_written, int unlock); 97 unsigned long *nr_written, int unlock);
98static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
99 u64 len, u64 orig_start,
100 u64 block_start, u64 block_len,
101 u64 orig_block_len, int type);
97 102
98static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 103static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
99 struct inode *inode, struct inode *dir, 104 struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
698 703
699 em->block_start = ins.objectid; 704 em->block_start = ins.objectid;
700 em->block_len = ins.offset; 705 em->block_len = ins.offset;
706 em->orig_block_len = ins.offset;
701 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 em->bdev = root->fs_info->fs_devices->latest_bdev;
702 em->compress_type = async_extent->compress_type; 708 em->compress_type = async_extent->compress_type;
703 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 set_bit(EXTENT_FLAG_PINNED, &em->flags);
704 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 710 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
711 em->generation = -1;
705 712
706 while (1) { 713 while (1) {
707 write_lock(&em_tree->lock); 714 write_lock(&em_tree->lock);
708 ret = add_extent_mapping(em_tree, em); 715 ret = add_extent_mapping(em_tree, em);
716 if (!ret)
717 list_move(&em->list,
718 &em_tree->modified_extents);
709 write_unlock(&em_tree->lock); 719 write_unlock(&em_tree->lock);
710 if (ret != -EEXIST) { 720 if (ret != -EEXIST) {
711 free_extent_map(em); 721 free_extent_map(em);
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
803 * required to start IO on it. It may be clean and already done with 813 * required to start IO on it. It may be clean and already done with
804 * IO when we return. 814 * IO when we return.
805 */ 815 */
806static noinline int cow_file_range(struct inode *inode, 816static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
807 struct page *locked_page, 817 struct inode *inode,
808 u64 start, u64 end, int *page_started, 818 struct btrfs_root *root,
809 unsigned long *nr_written, 819 struct page *locked_page,
810 int unlock) 820 u64 start, u64 end, int *page_started,
821 unsigned long *nr_written,
822 int unlock)
811{ 823{
812 struct btrfs_root *root = BTRFS_I(inode)->root;
813 struct btrfs_trans_handle *trans;
814 u64 alloc_hint = 0; 824 u64 alloc_hint = 0;
815 u64 num_bytes; 825 u64 num_bytes;
816 unsigned long ram_size; 826 unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
823 int ret = 0; 833 int ret = 0;
824 834
825 BUG_ON(btrfs_is_free_space_inode(inode)); 835 BUG_ON(btrfs_is_free_space_inode(inode));
826 trans = btrfs_join_transaction(root);
827 if (IS_ERR(trans)) {
828 extent_clear_unlock_delalloc(inode,
829 &BTRFS_I(inode)->io_tree,
830 start, end, locked_page,
831 EXTENT_CLEAR_UNLOCK_PAGE |
832 EXTENT_CLEAR_UNLOCK |
833 EXTENT_CLEAR_DELALLOC |
834 EXTENT_CLEAR_DIRTY |
835 EXTENT_SET_WRITEBACK |
836 EXTENT_END_WRITEBACK);
837 return PTR_ERR(trans);
838 }
839 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
840 836
841 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 837 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
842 num_bytes = max(blocksize, num_bytes); 838 num_bytes = max(blocksize, num_bytes);
843 disk_num_bytes = num_bytes; 839 disk_num_bytes = num_bytes;
844 ret = 0;
845 840
846 /* if this is a small write inside eof, kick off defrag */ 841 /* if this is a small write inside eof, kick off defrag */
847 if (num_bytes < 64 * 1024 && 842 if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
900 895
901 em->block_start = ins.objectid; 896 em->block_start = ins.objectid;
902 em->block_len = ins.offset; 897 em->block_len = ins.offset;
898 em->orig_block_len = ins.offset;
903 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 em->bdev = root->fs_info->fs_devices->latest_bdev;
904 set_bit(EXTENT_FLAG_PINNED, &em->flags); 900 set_bit(EXTENT_FLAG_PINNED, &em->flags);
901 em->generation = -1;
905 902
906 while (1) { 903 while (1) {
907 write_lock(&em_tree->lock); 904 write_lock(&em_tree->lock);
908 ret = add_extent_mapping(em_tree, em); 905 ret = add_extent_mapping(em_tree, em);
906 if (!ret)
907 list_move(&em->list,
908 &em_tree->modified_extents);
909 write_unlock(&em_tree->lock); 909 write_unlock(&em_tree->lock);
910 if (ret != -EEXIST) { 910 if (ret != -EEXIST) {
911 free_extent_map(em); 911 free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
952 alloc_hint = ins.objectid + ins.offset; 952 alloc_hint = ins.objectid + ins.offset;
953 start += cur_alloc_size; 953 start += cur_alloc_size;
954 } 954 }
955 ret = 0;
956out: 955out:
957 btrfs_end_transaction(trans, root);
958
959 return ret; 956 return ret;
957
960out_unlock: 958out_unlock:
961 extent_clear_unlock_delalloc(inode, 959 extent_clear_unlock_delalloc(inode,
962 &BTRFS_I(inode)->io_tree, 960 &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
971 goto out; 969 goto out;
972} 970}
973 971
972static noinline int cow_file_range(struct inode *inode,
973 struct page *locked_page,
974 u64 start, u64 end, int *page_started,
975 unsigned long *nr_written,
976 int unlock)
977{
978 struct btrfs_trans_handle *trans;
979 struct btrfs_root *root = BTRFS_I(inode)->root;
980 int ret;
981
982 trans = btrfs_join_transaction(root);
983 if (IS_ERR(trans)) {
984 extent_clear_unlock_delalloc(inode,
985 &BTRFS_I(inode)->io_tree,
986 start, end, locked_page,
987 EXTENT_CLEAR_UNLOCK_PAGE |
988 EXTENT_CLEAR_UNLOCK |
989 EXTENT_CLEAR_DELALLOC |
990 EXTENT_CLEAR_DIRTY |
991 EXTENT_SET_WRITEBACK |
992 EXTENT_END_WRITEBACK);
993 return PTR_ERR(trans);
994 }
995 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
996
997 ret = __cow_file_range(trans, inode, root, locked_page, start, end,
998 page_started, nr_written, unlock);
999
1000 btrfs_end_transaction(trans, root);
1001
1002 return ret;
1003}
1004
974/* 1005/*
975 * work queue callback to start compression on a file and pages 1006
976 */ 1007 */
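cow_file_range() becomes a thin wrapper that owns the transaction, while __cow_file_range() does the work against a caller-supplied handle, so run_delalloc_nocow() can reuse the transaction it already holds. A generic sketch of that split (userspace stand-ins, not the btrfs API):

#include <stdio.h>

struct trans { int id; };

/* core: runs inside whatever transaction the caller already holds */
static int __do_work(struct trans *t, int arg)
{
	printf("work(arg=%d) in trans %d\n", arg, t->id);
	return 0;
}

/* wrapper: starts and ends its own transaction around the core */
static int do_work(int arg)
{
	struct trans t = { .id = 1 };   /* btrfs_join_transaction() */
	int ret = __do_work(&t, arg);
	/* btrfs_end_transaction() */
	return ret;
}

int main(void)
{
	struct trans outer = { .id = 42 };

	do_work(7);             /* standalone caller */
	__do_work(&outer, 8);   /* caller with its own transaction */
	return 0;
}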
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1126 u64 extent_offset; 1157 u64 extent_offset;
1127 u64 disk_bytenr; 1158 u64 disk_bytenr;
1128 u64 num_bytes; 1159 u64 num_bytes;
1160 u64 disk_num_bytes;
1129 int extent_type; 1161 int extent_type;
1130 int ret, err; 1162 int ret, err;
1131 int type; 1163 int type;
@@ -1228,6 +1260,8 @@ next_slot:
1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1260 extent_offset = btrfs_file_extent_offset(leaf, fi);
1229 extent_end = found_key.offset + 1261 extent_end = found_key.offset +
1230 btrfs_file_extent_num_bytes(leaf, fi); 1262 btrfs_file_extent_num_bytes(leaf, fi);
1263 disk_num_bytes =
1264 btrfs_file_extent_disk_num_bytes(leaf, fi);
1231 if (extent_end <= start) { 1265 if (extent_end <= start) {
1232 path->slots[0]++; 1266 path->slots[0]++;
1233 goto next_slot; 1267 goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
1281 1315
1282 btrfs_release_path(path); 1316 btrfs_release_path(path);
1283 if (cow_start != (u64)-1) { 1317 if (cow_start != (u64)-1) {
1284 ret = cow_file_range(inode, locked_page, cow_start, 1318 ret = __cow_file_range(trans, inode, root, locked_page,
1285 found_key.offset - 1, page_started, 1319 cow_start, found_key.offset - 1,
1286 nr_written, 1); 1320 page_started, nr_written, 1);
1287 if (ret) { 1321 if (ret) {
1288 btrfs_abort_transaction(trans, root, ret); 1322 btrfs_abort_transaction(trans, root, ret);
1289 goto error; 1323 goto error;
@@ -1298,16 +1332,21 @@ out_check:
1298 em = alloc_extent_map(); 1332 em = alloc_extent_map();
1299 BUG_ON(!em); /* -ENOMEM */ 1333 BUG_ON(!em); /* -ENOMEM */
1300 em->start = cur_offset; 1334 em->start = cur_offset;
1301 em->orig_start = em->start; 1335 em->orig_start = found_key.offset - extent_offset;
1302 em->len = num_bytes; 1336 em->len = num_bytes;
1303 em->block_len = num_bytes; 1337 em->block_len = num_bytes;
1304 em->block_start = disk_bytenr; 1338 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes;
1305 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 em->bdev = root->fs_info->fs_devices->latest_bdev;
1306 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1307 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1342 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1;
1308 while (1) { 1344 while (1) {
1309 write_lock(&em_tree->lock); 1345 write_lock(&em_tree->lock);
1310 ret = add_extent_mapping(em_tree, em); 1346 ret = add_extent_mapping(em_tree, em);
1347 if (!ret)
1348 list_move(&em->list,
1349 &em_tree->modified_extents);
1311 write_unlock(&em_tree->lock); 1350 write_unlock(&em_tree->lock);
1312 if (ret != -EEXIST) { 1351 if (ret != -EEXIST) {
1313 free_extent_map(em); 1352 free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
1352 } 1391 }
1353 1392
1354 if (cow_start != (u64)-1) { 1393 if (cow_start != (u64)-1) {
1355 ret = cow_file_range(inode, locked_page, cow_start, end, 1394 ret = __cow_file_range(trans, inode, root, locked_page,
1356 page_started, nr_written, 1); 1395 cow_start, end,
1396 page_started, nr_written, 1);
1357 if (ret) { 1397 if (ret) {
1358 btrfs_abort_transaction(trans, root, ret); 1398 btrfs_abort_transaction(trans, root, ret);
1359 goto error; 1399 goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1531 unsigned long bio_flags) 1571 unsigned long bio_flags)
1532{ 1572{
1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1573 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1534 struct btrfs_mapping_tree *map_tree;
1535 u64 logical = (u64)bio->bi_sector << 9; 1574 u64 logical = (u64)bio->bi_sector << 9;
1536 u64 length = 0; 1575 u64 length = 0;
1537 u64 map_length; 1576 u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1541 return 0; 1580 return 0;
1542 1581
1543 length = bio->bi_size; 1582 length = bio->bi_size;
1544 map_tree = &root->fs_info->mapping_tree;
1545 map_length = length; 1583 map_length = length;
1546 ret = btrfs_map_block(map_tree, READ, logical, 1584 ret = btrfs_map_block(root->fs_info, READ, logical,
1547 &map_length, NULL, 0); 1585 &map_length, NULL, 0);
1548 /* Will always return 0 or 1 with map_multi == NULL */ 1586 /* Will always return 0 with map_multi == NULL */
1549 BUG_ON(ret < 0); 1587 BUG_ON(ret < 0);
1550 if (map_length < length + size) 1588 if (map_length < length + size)
1551 return 1; 1589 return 1;
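btrfs_map_block() now takes fs_info directly and reports how many contiguous bytes map to one stripe; the merge is refused when appending the new data would push the bio past that boundary. A userspace sketch of just the length test:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* would appending "size" bytes make the bio cross a stripe boundary? */
static bool must_split(uint64_t map_length, uint64_t bio_length, uint64_t size)
{
	return map_length < bio_length + size;
}

int main(void)
{
	uint64_t stripe_remaining = 64 * 1024;  /* map_length from btrfs_map_block() */

	printf("%d\n", must_split(stripe_remaining, 60 * 1024, 4096)); /* 0: fits */
	printf("%d\n", must_split(stripe_remaining, 64 * 1024, 4096)); /* 1: split */
	return 0;
}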
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1586 u64 bio_offset) 1624 u64 bio_offset)
1587{ 1625{
1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1626 struct btrfs_root *root = BTRFS_I(inode)->root;
1589 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1627 int ret;
1628
1629 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1630 if (ret)
1631 bio_endio(bio, ret);
1632 return ret;
1590} 1633}
1591 1634
1592/* 1635/*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1601 int ret = 0; 1644 int ret = 0;
1602 int skip_sum; 1645 int skip_sum;
1603 int metadata = 0; 1646 int metadata = 0;
1647 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1604 1648
1605 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1606 1650
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1610 if (!(rw & REQ_WRITE)) { 1654 if (!(rw & REQ_WRITE)) {
1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1655 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1612 if (ret) 1656 if (ret)
1613 return ret; 1657 goto out;
1614 1658
1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1659 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1616 return btrfs_submit_compressed_read(inode, bio, 1660 ret = btrfs_submit_compressed_read(inode, bio,
1617 mirror_num, bio_flags); 1661 mirror_num,
1662 bio_flags);
1663 goto out;
1618 } else if (!skip_sum) { 1664 } else if (!skip_sum) {
1619 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1665 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1620 if (ret) 1666 if (ret)
1621 return ret; 1667 goto out;
1622 } 1668 }
1623 goto mapit; 1669 goto mapit;
1624 } else if (!skip_sum) { 1670 } else if (async && !skip_sum) {
1625 /* csum items have already been cloned */ 1671 /* csum items have already been cloned */
1626 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1672 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1627 goto mapit; 1673 goto mapit;
1628 /* we're doing a write, do the async checksumming */ 1674 /* we're doing a write, do the async checksumming */
1629 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1675 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1630 inode, rw, bio, mirror_num, 1676 inode, rw, bio, mirror_num,
1631 bio_flags, bio_offset, 1677 bio_flags, bio_offset,
1632 __btrfs_submit_bio_start, 1678 __btrfs_submit_bio_start,
1633 __btrfs_submit_bio_done); 1679 __btrfs_submit_bio_done);
1680 goto out;
1681 } else if (!skip_sum) {
1682 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1683 if (ret)
1684 goto out;
1634 } 1685 }
1635 1686
1636mapit: 1687mapit:
1637 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1688 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1689
1690out:
1691 if (ret < 0)
1692 bio_endio(bio, ret);
1693 return ret;
1638} 1694}
1639 1695
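The write path now checksums inline when BTRFS_I(inode)->sync_writers indicates synchronous writers, skipping the async worker round-trip, and every early exit funnels through one spot that completes the bio with the error instead of leaking it. A simplified sketch of the decision order (booleans stand in for the kernel state; the relocation-tree special case is omitted):

#include <stdio.h>
#include <stdbool.h>

static const char *submit_path(bool is_write, bool skip_sum, bool sync_writers)
{
	if (!is_write)
		return "read: look up csums, then map";
	if (!sync_writers && !skip_sum)
		return "write: async csum worker";      /* previous default */
	if (!skip_sum)
		return "write: csum inline, then map";  /* new sync path */
	return "write: map directly";
}

int main(void)
{
	printf("%s\n", submit_path(true, false, true));
	printf("%s\n", submit_path(true, false, false));
	printf("%s\n", submit_path(false, false, false));
	return 0;
}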
1640/* 1696/*
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1657int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1713int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1658 struct extent_state **cached_state) 1714 struct extent_state **cached_state)
1659{ 1715{
1660 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1716 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1661 WARN_ON(1);
1662 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1717 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1663 cached_state, GFP_NOFS); 1718 cached_state, GFP_NOFS);
1664} 1719}
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1867 1922
1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1923 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1924 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1870 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1925 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1871 if (!ret) { 1926 if (nolock)
1872 if (nolock) 1927 trans = btrfs_join_transaction_nolock(root);
1873 trans = btrfs_join_transaction_nolock(root); 1928 else
1874 else 1929 trans = btrfs_join_transaction(root);
1875 trans = btrfs_join_transaction(root); 1930 if (IS_ERR(trans)) {
1876 if (IS_ERR(trans)) { 1931 ret = PTR_ERR(trans);
1877 ret = PTR_ERR(trans); 1932 trans = NULL;
1878 trans = NULL; 1933 goto out;
1879 goto out;
1880 }
1881 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1882 ret = btrfs_update_inode_fallback(trans, root, inode);
1883 if (ret) /* -ENOMEM or corruption */
1884 btrfs_abort_transaction(trans, root, ret);
1885 } 1934 }
1935 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1936 ret = btrfs_update_inode_fallback(trans, root, inode);
1937 if (ret) /* -ENOMEM or corruption */
1938 btrfs_abort_transaction(trans, root, ret);
1886 goto out; 1939 goto out;
1887 } 1940 }
1888 1941
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1984 add_pending_csums(trans, inode, ordered_extent->file_offset,
1932 &ordered_extent->list); 1985 &ordered_extent->list);
1933 1986
1934 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1987 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1935 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1988 ret = btrfs_update_inode_fallback(trans, root, inode);
1936 ret = btrfs_update_inode_fallback(trans, root, inode); 1989 if (ret) { /* -ENOMEM or corruption */
1937 if (ret) { /* -ENOMEM or corruption */ 1990 btrfs_abort_transaction(trans, root, ret);
1938 btrfs_abort_transaction(trans, root, ret); 1991 goto out_unlock;
1939 goto out_unlock;
1940 }
1941 } else {
1942 btrfs_set_inode_last_trans(trans, inode);
1943 } 1992 }
1944 ret = 0; 1993 ret = 0;
1945out_unlock: 1994out_unlock:
@@ -2429,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2429 continue; 2478 continue;
2430 } 2479 }
2431 nr_truncate++; 2480 nr_truncate++;
2481
2482 /* 1 for the orphan item deletion. */
2483 trans = btrfs_start_transaction(root, 1);
2484 if (IS_ERR(trans)) {
2485 ret = PTR_ERR(trans);
2486 goto out;
2487 }
2488 ret = btrfs_orphan_add(trans, inode);
2489 btrfs_end_transaction(trans, root);
2490 if (ret)
2491 goto out;
2492
2432 ret = btrfs_truncate(inode); 2493 ret = btrfs_truncate(inode);
2433 } else { 2494 } else {
2434 nr_unlink++; 2495 nr_unlink++;
@@ -3074,7 +3135,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3074 struct btrfs_trans_handle *trans; 3135 struct btrfs_trans_handle *trans;
3075 struct inode *inode = dentry->d_inode; 3136 struct inode *inode = dentry->d_inode;
3076 int ret; 3137 int ret;
3077 unsigned long nr = 0;
3078 3138
3079 trans = __unlink_start_trans(dir, dentry); 3139 trans = __unlink_start_trans(dir, dentry);
3080 if (IS_ERR(trans)) 3140 if (IS_ERR(trans))
@@ -3094,9 +3154,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3094 } 3154 }
3095 3155
3096out: 3156out:
3097 nr = trans->blocks_used;
3098 __unlink_end_trans(trans, root); 3157 __unlink_end_trans(trans, root);
3099 btrfs_btree_balance_dirty(root, nr); 3158 btrfs_btree_balance_dirty(root);
3100 return ret; 3159 return ret;
3101} 3160}
3102 3161
@@ -3186,7 +3245,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3186 int err = 0; 3245 int err = 0;
3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3246 struct btrfs_root *root = BTRFS_I(dir)->root;
3188 struct btrfs_trans_handle *trans; 3247 struct btrfs_trans_handle *trans;
3189 unsigned long nr = 0;
3190 3248
3191 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3249 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3192 return -ENOTEMPTY; 3250 return -ENOTEMPTY;
@@ -3215,9 +3273,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3215 if (!err) 3273 if (!err)
3216 btrfs_i_size_write(inode, 0); 3274 btrfs_i_size_write(inode, 0);
3217out: 3275out:
3218 nr = trans->blocks_used;
3219 __unlink_end_trans(trans, root); 3276 __unlink_end_trans(trans, root);
3220 btrfs_btree_balance_dirty(root, nr); 3277 btrfs_btree_balance_dirty(root);
3221 3278
3222 return err; 3279 return err;
3223} 3280}
@@ -3497,11 +3554,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3497 if (ret) 3554 if (ret)
3498 goto out; 3555 goto out;
3499 3556
3500 ret = -ENOMEM;
3501again: 3557again:
3502 page = find_or_create_page(mapping, index, mask); 3558 page = find_or_create_page(mapping, index, mask);
3503 if (!page) { 3559 if (!page) {
3504 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3560 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3561 ret = -ENOMEM;
3505 goto out; 3562 goto out;
3506 } 3563 }
3507 3564
@@ -3550,7 +3607,6 @@ again:
3550 goto out_unlock; 3607 goto out_unlock;
3551 } 3608 }
3552 3609
3553 ret = 0;
3554 if (offset != PAGE_CACHE_SIZE) { 3610 if (offset != PAGE_CACHE_SIZE) {
3555 if (!len) 3611 if (!len)
3556 len = PAGE_CACHE_SIZE - offset; 3612 len = PAGE_CACHE_SIZE - offset;
@@ -3621,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3621 block_end - cur_offset, 0); 3677 block_end - cur_offset, 0);
3622 if (IS_ERR(em)) { 3678 if (IS_ERR(em)) {
3623 err = PTR_ERR(em); 3679 err = PTR_ERR(em);
3680 em = NULL;
3624 break; 3681 break;
3625 } 3682 }
3626 last_byte = min(extent_map_end(em), block_end); 3683 last_byte = min(extent_map_end(em), block_end);
@@ -3668,6 +3725,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3668 3725
3669 hole_em->block_start = EXTENT_MAP_HOLE; 3726 hole_em->block_start = EXTENT_MAP_HOLE;
3670 hole_em->block_len = 0; 3727 hole_em->block_len = 0;
3728 hole_em->orig_block_len = 0;
3671 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3729 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3672 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3730 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3673 hole_em->generation = trans->transid; 3731 hole_em->generation = trans->transid;
@@ -3703,16 +3761,27 @@ next:
3703 return err; 3761 return err;
3704} 3762}
3705 3763
3706static int btrfs_setsize(struct inode *inode, loff_t newsize) 3764static int btrfs_setsize(struct inode *inode, struct iattr *attr)
3707{ 3765{
3708 struct btrfs_root *root = BTRFS_I(inode)->root; 3766 struct btrfs_root *root = BTRFS_I(inode)->root;
3709 struct btrfs_trans_handle *trans; 3767 struct btrfs_trans_handle *trans;
3710 loff_t oldsize = i_size_read(inode); 3768 loff_t oldsize = i_size_read(inode);
3769 loff_t newsize = attr->ia_size;
3770 int mask = attr->ia_valid;
3711 int ret; 3771 int ret;
3712 3772
3713 if (newsize == oldsize) 3773 if (newsize == oldsize)
3714 return 0; 3774 return 0;
3715 3775
3776 /*
3777 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
3778 * special case where we need to update the times despite not having
3779 * these flags set. For all other operations the VFS sets these flags
3780 * explicitly if it wants a timestamp update.
3781 */
3782 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
3783 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
3784
3716 if (newsize > oldsize) { 3785 if (newsize > oldsize) {
3717 truncate_pagecache(inode, oldsize, newsize); 3786 truncate_pagecache(inode, oldsize, newsize);
3718 ret = btrfs_cont_expand(inode, oldsize, newsize); 3787 ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3738,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3738 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3807 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3739 &BTRFS_I(inode)->runtime_flags); 3808 &BTRFS_I(inode)->runtime_flags);
3740 3809
3810 /*
3811 * 1 for the orphan item we're going to add
3812 * 1 for the orphan item deletion.
3813 */
3814 trans = btrfs_start_transaction(root, 2);
3815 if (IS_ERR(trans))
3816 return PTR_ERR(trans);
3817
3818 /*
3819 * We need to do this in case we fail at _any_ point during the
3820 * actual truncate. Once we do the truncate_setsize we could
3821 * invalidate pages which forces any outstanding ordered io to
3822 * be instantly completed which will give us extents that need
3823 * to be truncated. If we fail to get an orphan inode down we
3824 * could have left over extents that were never meant to live,
3825 * so we need to guarantee from this point on that everything
3826 * will be consistent.
3827 */
3828 ret = btrfs_orphan_add(trans, inode);
3829 btrfs_end_transaction(trans, root);
3830 if (ret)
3831 return ret;
3832
3741 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3833 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3742 truncate_setsize(inode, newsize); 3834 truncate_setsize(inode, newsize);
3743 ret = btrfs_truncate(inode); 3835 ret = btrfs_truncate(inode);
3836 if (ret && inode->i_nlink)
3837 btrfs_orphan_del(NULL, inode);
3744 } 3838 }
3745 3839
3746 return ret; 3840 return ret;
@@ -3760,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3760 return err; 3854 return err;
3761 3855
3762 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3856 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3763 err = btrfs_setsize(inode, attr->ia_size); 3857 err = btrfs_setsize(inode, attr);
3764 if (err) 3858 if (err)
3765 return err; 3859 return err;
3766 } 3860 }
@@ -3783,7 +3877,6 @@ void btrfs_evict_inode(struct inode *inode)
3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3877 struct btrfs_root *root = BTRFS_I(inode)->root;
3784 struct btrfs_block_rsv *rsv, *global_rsv; 3878 struct btrfs_block_rsv *rsv, *global_rsv;
3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3879 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3786 unsigned long nr;
3787 int ret; 3880 int ret;
3788 3881
3789 trace_btrfs_inode_evict(inode); 3882 trace_btrfs_inode_evict(inode);
@@ -3829,7 +3922,8 @@ void btrfs_evict_inode(struct inode *inode)
3829 * inode item when doing the truncate. 3922 * inode item when doing the truncate.
3830 */ 3923 */
3831 while (1) { 3924 while (1) {
3832 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3925 ret = btrfs_block_rsv_refill(root, rsv, min_size,
3926 BTRFS_RESERVE_FLUSH_LIMIT);
3833 3927
3834 /* 3928 /*
3835 * Try and steal from the global reserve since we will 3929 * Try and steal from the global reserve since we will
@@ -3847,7 +3941,7 @@ void btrfs_evict_inode(struct inode *inode)
3847 goto no_delete; 3941 goto no_delete;
3848 } 3942 }
3849 3943
3850 trans = btrfs_start_transaction_noflush(root, 1); 3944 trans = btrfs_start_transaction_lflush(root, 1);
3851 if (IS_ERR(trans)) { 3945 if (IS_ERR(trans)) {
3852 btrfs_orphan_del(NULL, inode); 3946 btrfs_orphan_del(NULL, inode);
3853 btrfs_free_block_rsv(root, rsv); 3947 btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3958,9 @@ void btrfs_evict_inode(struct inode *inode)
3864 ret = btrfs_update_inode(trans, root, inode); 3958 ret = btrfs_update_inode(trans, root, inode);
3865 BUG_ON(ret); 3959 BUG_ON(ret);
3866 3960
3867 nr = trans->blocks_used;
3868 btrfs_end_transaction(trans, root); 3961 btrfs_end_transaction(trans, root);
3869 trans = NULL; 3962 trans = NULL;
3870 btrfs_btree_balance_dirty(root, nr); 3963 btrfs_btree_balance_dirty(root);
3871 } 3964 }
3872 3965
3873 btrfs_free_block_rsv(root, rsv); 3966 btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3976,8 @@ void btrfs_evict_inode(struct inode *inode)
3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3976 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3884 btrfs_return_ino(root, btrfs_ino(inode)); 3977 btrfs_return_ino(root, btrfs_ino(inode));
3885 3978
3886 nr = trans->blocks_used;
3887 btrfs_end_transaction(trans, root); 3979 btrfs_end_transaction(trans, root);
3888 btrfs_btree_balance_dirty(root, nr); 3980 btrfs_btree_balance_dirty(root);
3889no_delete: 3981no_delete:
3890 clear_inode(inode); 3982 clear_inode(inode);
3891 return; 3983 return;
@@ -4219,16 +4311,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4219 if (dentry->d_name.len > BTRFS_NAME_LEN) 4311 if (dentry->d_name.len > BTRFS_NAME_LEN)
4220 return ERR_PTR(-ENAMETOOLONG); 4312 return ERR_PTR(-ENAMETOOLONG);
4221 4313
4222 if (unlikely(d_need_lookup(dentry))) { 4314 ret = btrfs_inode_by_name(dir, dentry, &location);
4223 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4224 kfree(dentry->d_fsdata);
4225 dentry->d_fsdata = NULL;
4226 /* This thing is hashed, drop it for now */
4227 d_drop(dentry);
4228 } else {
4229 ret = btrfs_inode_by_name(dir, dentry, &location);
4230 }
4231
4232 if (ret < 0) 4315 if (ret < 0)
4233 return ERR_PTR(ret); 4316 return ERR_PTR(ret);
4234 4317
@@ -4298,11 +4381,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4298 struct dentry *ret; 4381 struct dentry *ret;
4299 4382
4300 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4383 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4301 if (unlikely(d_need_lookup(dentry))) {
4302 spin_lock(&dentry->d_lock);
4303 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4304 spin_unlock(&dentry->d_lock);
4305 }
4306 return ret; 4384 return ret;
4307} 4385}
4308 4386
@@ -4775,8 +4853,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4775 if (S_ISREG(mode)) { 4853 if (S_ISREG(mode)) {
4776 if (btrfs_test_opt(root, NODATASUM)) 4854 if (btrfs_test_opt(root, NODATASUM))
4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4855 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4778 if (btrfs_test_opt(root, NODATACOW) || 4856 if (btrfs_test_opt(root, NODATACOW))
4779 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4780 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4857 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4781 } 4858 }
4782 4859
@@ -4842,7 +4919,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4919 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4843 parent_inode, &key, 4920 parent_inode, &key,
4844 btrfs_inode_type(inode), index); 4921 btrfs_inode_type(inode), index);
4845 if (ret == -EEXIST) 4922 if (ret == -EEXIST || ret == -EOVERFLOW)
4846 goto fail_dir_item; 4923 goto fail_dir_item;
4847 else if (ret) { 4924 else if (ret) {
4848 btrfs_abort_transaction(trans, root, ret); 4925 btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4974,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4897 int err; 4974 int err;
4898 int drop_inode = 0; 4975 int drop_inode = 0;
4899 u64 objectid; 4976 u64 objectid;
4900 unsigned long nr = 0;
4901 u64 index = 0; 4977 u64 index = 0;
4902 4978
4903 if (!new_valid_dev(rdev)) 4979 if (!new_valid_dev(rdev))
@@ -4930,6 +5006,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4930 goto out_unlock; 5006 goto out_unlock;
4931 } 5007 }
4932 5008
5009 err = btrfs_update_inode(trans, root, inode);
5010 if (err) {
5011 drop_inode = 1;
5012 goto out_unlock;
5013 }
5014
4933 /* 5015 /*
4934 * If the active LSM wants to access the inode during 5016 * If the active LSM wants to access the inode during
4935 * d_instantiate it needs these. Smack checks to see 5017 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +5029,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4947 d_instantiate(dentry, inode); 5029 d_instantiate(dentry, inode);
4948 } 5030 }
4949out_unlock: 5031out_unlock:
4950 nr = trans->blocks_used;
4951 btrfs_end_transaction(trans, root); 5032 btrfs_end_transaction(trans, root);
4952 btrfs_btree_balance_dirty(root, nr); 5033 btrfs_btree_balance_dirty(root);
4953 if (drop_inode) { 5034 if (drop_inode) {
4954 inode_dec_link_count(inode); 5035 inode_dec_link_count(inode);
4955 iput(inode); 5036 iput(inode);
@@ -4963,9 +5044,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4963 struct btrfs_trans_handle *trans; 5044 struct btrfs_trans_handle *trans;
4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5045 struct btrfs_root *root = BTRFS_I(dir)->root;
4965 struct inode *inode = NULL; 5046 struct inode *inode = NULL;
4966 int drop_inode = 0; 5047 int drop_inode_on_err = 0;
4967 int err; 5048 int err;
4968 unsigned long nr = 0;
4969 u64 objectid; 5049 u64 objectid;
4970 u64 index = 0; 5050 u64 index = 0;
4971 5051
@@ -4989,12 +5069,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4989 err = PTR_ERR(inode); 5069 err = PTR_ERR(inode);
4990 goto out_unlock; 5070 goto out_unlock;
4991 } 5071 }
5072 drop_inode_on_err = 1;
4992 5073
4993 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5074 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4994 if (err) { 5075 if (err)
4995 drop_inode = 1; 5076 goto out_unlock;
5077
5078 err = btrfs_update_inode(trans, root, inode);
5079 if (err)
4996 goto out_unlock; 5080 goto out_unlock;
4997 }
4998 5081
4999 /* 5082 /*
5000 * If the active LSM wants to access the inode during 5083 * If the active LSM wants to access the inode during
@@ -5007,21 +5090,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5007 5090
5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5091 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5009 if (err) 5092 if (err)
5010 drop_inode = 1; 5093 goto out_unlock;
5011 else { 5094
5012 inode->i_mapping->a_ops = &btrfs_aops; 5095 inode->i_mapping->a_ops = &btrfs_aops;
5013 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5096 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5014 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5097 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5015 d_instantiate(dentry, inode); 5098 d_instantiate(dentry, inode);
5016 } 5099
5017out_unlock: 5100out_unlock:
5018 nr = trans->blocks_used;
5019 btrfs_end_transaction(trans, root); 5101 btrfs_end_transaction(trans, root);
5020 if (drop_inode) { 5102 if (err && drop_inode_on_err) {
5021 inode_dec_link_count(inode); 5103 inode_dec_link_count(inode);
5022 iput(inode); 5104 iput(inode);
5023 } 5105 }
5024 btrfs_btree_balance_dirty(root, nr); 5106 btrfs_btree_balance_dirty(root);
5025 return err; 5107 return err;
5026} 5108}
5027 5109
@@ -5032,7 +5114,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5114 struct btrfs_root *root = BTRFS_I(dir)->root;
5033 struct inode *inode = old_dentry->d_inode; 5115 struct inode *inode = old_dentry->d_inode;
5034 u64 index; 5116 u64 index;
5035 unsigned long nr = 0;
5036 int err; 5117 int err;
5037 int drop_inode = 0; 5118 int drop_inode = 0;
5038 5119
@@ -5062,6 +5143,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5062 inode_inc_iversion(inode); 5143 inode_inc_iversion(inode);
5063 inode->i_ctime = CURRENT_TIME; 5144 inode->i_ctime = CURRENT_TIME;
5064 ihold(inode); 5145 ihold(inode);
5146 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5065 5147
5066 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5148 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5067 5149
@@ -5076,14 +5158,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5076 btrfs_log_new_name(trans, inode, NULL, parent); 5158 btrfs_log_new_name(trans, inode, NULL, parent);
5077 } 5159 }
5078 5160
5079 nr = trans->blocks_used;
5080 btrfs_end_transaction(trans, root); 5161 btrfs_end_transaction(trans, root);
5081fail: 5162fail:
5082 if (drop_inode) { 5163 if (drop_inode) {
5083 inode_dec_link_count(inode); 5164 inode_dec_link_count(inode);
5084 iput(inode); 5165 iput(inode);
5085 } 5166 }
5086 btrfs_btree_balance_dirty(root, nr); 5167 btrfs_btree_balance_dirty(root);
5087 return err; 5168 return err;
5088} 5169}
5089 5170
@@ -5096,7 +5177,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5096 int drop_on_err = 0; 5177 int drop_on_err = 0;
5097 u64 objectid = 0; 5178 u64 objectid = 0;
5098 u64 index = 0; 5179 u64 index = 0;
5099 unsigned long nr = 1;
5100 5180
5101 /* 5181 /*
5102 * 2 items for inode and ref 5182 * 2 items for inode and ref
@@ -5142,11 +5222,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5142 drop_on_err = 0; 5222 drop_on_err = 0;
5143 5223
5144out_fail: 5224out_fail:
5145 nr = trans->blocks_used;
5146 btrfs_end_transaction(trans, root); 5225 btrfs_end_transaction(trans, root);
5147 if (drop_on_err) 5226 if (drop_on_err)
5148 iput(inode); 5227 iput(inode);
5149 btrfs_btree_balance_dirty(root, nr); 5228 btrfs_btree_balance_dirty(root);
5150 return err; 5229 return err;
5151} 5230}
5152 5231
@@ -5340,6 +5419,7 @@ again:
5340 if (start + len <= found_key.offset) 5419 if (start + len <= found_key.offset)
5341 goto not_found; 5420 goto not_found;
5342 em->start = start; 5421 em->start = start;
5422 em->orig_start = start;
5343 em->len = found_key.offset - start; 5423 em->len = found_key.offset - start;
5344 goto not_found_em; 5424 goto not_found_em;
5345 } 5425 }
@@ -5350,6 +5430,8 @@ again:
5350 em->len = extent_end - extent_start; 5430 em->len = extent_end - extent_start;
5351 em->orig_start = extent_start - 5431 em->orig_start = extent_start -
5352 btrfs_file_extent_offset(leaf, item); 5432 btrfs_file_extent_offset(leaf, item);
5433 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5434 item);
5353 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5435 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5354 if (bytenr == 0) { 5436 if (bytenr == 0) {
5355 em->block_start = EXTENT_MAP_HOLE; 5437 em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5441,7 @@ again:
5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5441 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5360 em->compress_type = compress_type; 5442 em->compress_type = compress_type;
5361 em->block_start = bytenr; 5443 em->block_start = bytenr;
5362 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5444 em->block_len = em->orig_block_len;
5363 item);
5364 } else { 5445 } else {
5365 bytenr += btrfs_file_extent_offset(leaf, item); 5446 bytenr += btrfs_file_extent_offset(leaf, item);
5366 em->block_start = bytenr; 5447 em->block_start = bytenr;
@@ -5390,7 +5471,8 @@ again:
5390 em->start = extent_start + extent_offset; 5471 em->start = extent_start + extent_offset;
5391 em->len = (copy_size + root->sectorsize - 1) & 5472 em->len = (copy_size + root->sectorsize - 1) &
5392 ~((u64)root->sectorsize - 1); 5473 ~((u64)root->sectorsize - 1);
5393 em->orig_start = EXTENT_MAP_INLINE; 5474 em->orig_block_len = em->len;
5475 em->orig_start = em->start;
5394 if (compress_type) { 5476 if (compress_type) {
5395 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5477 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5396 em->compress_type = compress_type; 5478 em->compress_type = compress_type;
@@ -5439,11 +5521,11 @@ again:
5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5521 extent_map_end(em) - 1, NULL, GFP_NOFS);
5440 goto insert; 5522 goto insert;
5441 } else { 5523 } else {
5442 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5524 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
5443 WARN_ON(1);
5444 } 5525 }
5445not_found: 5526not_found:
5446 em->start = start; 5527 em->start = start;
5528 em->orig_start = start;
5447 em->len = len; 5529 em->len = len;
5448not_found_em: 5530not_found_em:
5449 em->block_start = EXTENT_MAP_HOLE; 5531 em->block_start = EXTENT_MAP_HOLE;
@@ -5539,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5539 return em; 5621 return em;
5540 if (em) { 5622 if (em) {
5541 /* 5623 /*
5542 * if our em maps to a hole, there might 5624 * if our em maps to
5543 * actually be delalloc bytes behind it 5625 * - a hole or
5626 * - a pre-alloc extent,
5627 * there might actually be delalloc bytes behind it.
5544 */ 5628 */
5545 if (em->block_start != EXTENT_MAP_HOLE) 5629 if (em->block_start != EXTENT_MAP_HOLE &&
5630 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5546 return em; 5631 return em;
5547 else 5632 else
5548 hole_em = em; 5633 hole_em = em;
@@ -5624,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5624 */ 5709 */
5625 em->block_start = hole_em->block_start; 5710 em->block_start = hole_em->block_start;
5626 em->block_len = hole_len; 5711 em->block_len = hole_len;
5712 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
5713 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5627 } else { 5714 } else {
5628 em->start = range_start; 5715 em->start = range_start;
5629 em->len = found; 5716 em->len = found;
@@ -5645,38 +5732,19 @@ out:
5645} 5732}
5646 5733
5647static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5734static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5648 struct extent_map *em,
5649 u64 start, u64 len) 5735 u64 start, u64 len)
5650{ 5736{
5651 struct btrfs_root *root = BTRFS_I(inode)->root; 5737 struct btrfs_root *root = BTRFS_I(inode)->root;
5652 struct btrfs_trans_handle *trans; 5738 struct btrfs_trans_handle *trans;
5653 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5739 struct extent_map *em;
5654 struct btrfs_key ins; 5740 struct btrfs_key ins;
5655 u64 alloc_hint; 5741 u64 alloc_hint;
5656 int ret; 5742 int ret;
5657 bool insert = false;
5658
5659 /*
5660 * Ok if the extent map we looked up is a hole and is for the exact
5661 * range we want, there is no reason to allocate a new one, however if
5662 * it is not right then we need to free this one and drop the cache for
5663 * our range.
5664 */
5665 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5666 em->len != len) {
5667 free_extent_map(em);
5668 em = NULL;
5669 insert = true;
5670 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5671 }
5672 5743
5673 trans = btrfs_join_transaction(root); 5744 trans = btrfs_join_transaction(root);
5674 if (IS_ERR(trans)) 5745 if (IS_ERR(trans))
5675 return ERR_CAST(trans); 5746 return ERR_CAST(trans);
5676 5747
5677 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5678 btrfs_add_inode_defrag(trans, inode);
5679
5680 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5748 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5681 5749
5682 alloc_hint = get_extent_allocation_hint(inode, start, len); 5750 alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5755,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5687 goto out; 5755 goto out;
5688 } 5756 }
5689 5757
5690 if (!em) { 5758 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
5691 em = alloc_extent_map(); 5759 ins.offset, ins.offset, 0);
5692 if (!em) { 5760 if (IS_ERR(em))
5693 em = ERR_PTR(-ENOMEM); 5761 goto out;
5694 goto out;
5695 }
5696 }
5697
5698 em->start = start;
5699 em->orig_start = em->start;
5700 em->len = ins.offset;
5701
5702 em->block_start = ins.objectid;
5703 em->block_len = ins.offset;
5704 em->bdev = root->fs_info->fs_devices->latest_bdev;
5705
5706 /*
5707 * We need to do this because if we're using the original em we searched
5708 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5709 */
5710 em->flags = 0;
5711 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5712
5713 while (insert) {
5714 write_lock(&em_tree->lock);
5715 ret = add_extent_mapping(em_tree, em);
5716 write_unlock(&em_tree->lock);
5717 if (ret != -EEXIST)
5718 break;
5719 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5720 }
5721 5762
5722 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5763 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5723 ins.offset, ins.offset, 0); 5764 ins.offset, ins.offset, 0);
@@ -5894,7 +5935,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5894static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5935static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5895 u64 len, u64 orig_start, 5936 u64 len, u64 orig_start,
5896 u64 block_start, u64 block_len, 5937 u64 block_start, u64 block_len,
5897 int type) 5938 u64 orig_block_len, int type)
5898{ 5939{
5899 struct extent_map_tree *em_tree; 5940 struct extent_map_tree *em_tree;
5900 struct extent_map *em; 5941 struct extent_map *em;
@@ -5912,15 +5953,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5912 em->block_len = block_len; 5953 em->block_len = block_len;
5913 em->block_start = block_start; 5954 em->block_start = block_start;
5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5955 em->bdev = root->fs_info->fs_devices->latest_bdev;
5956 em->orig_block_len = orig_block_len;
5957 em->generation = -1;
5915 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5958 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5916 if (type == BTRFS_ORDERED_PREALLOC) 5959 if (type == BTRFS_ORDERED_PREALLOC)
5917 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5960 set_bit(EXTENT_FLAG_FILLING, &em->flags);
5918 5961
5919 do { 5962 do {
5920 btrfs_drop_extent_cache(inode, em->start, 5963 btrfs_drop_extent_cache(inode, em->start,
5921 em->start + em->len - 1, 0); 5964 em->start + em->len - 1, 0);
5922 write_lock(&em_tree->lock); 5965 write_lock(&em_tree->lock);
5923 ret = add_extent_mapping(em_tree, em); 5966 ret = add_extent_mapping(em_tree, em);
5967 if (!ret)
5968 list_move(&em->list,
5969 &em_tree->modified_extents);
5924 write_unlock(&em_tree->lock); 5970 write_unlock(&em_tree->lock);
5925 } while (ret == -EEXIST); 5971 } while (ret == -EEXIST);
5926 5972
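[editor's note] create_pinned_em now also records orig_block_len and generation = -1, moves the new em onto the tree's modified_extents list on success so fsync can find it, and keeps the existing drop-then-insert loop: when add_extent_mapping returns -EEXIST, the overlapping cached range is dropped and the insert retried. A userspace sketch of that loop over a toy range list (single-threaded; the kernel version holds em_tree->lock around the insert):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_EMS 16

    struct range { unsigned long long start, len; int used; };
    static struct range tree[MAX_EMS];

    static int overlaps(struct range *r, unsigned long long s,
                        unsigned long long l)
    {
            return r->used && s < r->start + r->len && r->start < s + l;
    }

    /* Fails like add_extent_mapping() when the range is already mapped. */
    static int add_mapping(unsigned long long s, unsigned long long l)
    {
            int i, slot = -1;
            for (i = 0; i < MAX_EMS; i++) {
                    if (overlaps(&tree[i], s, l))
                            return -EEXIST;
                    if (!tree[i].used && slot < 0)
                            slot = i;
            }
            if (slot < 0)
                    return -ENOMEM;
            tree[slot] = (struct range){ s, l, 1 };
            return 0;
    }

    /* Stand-in for btrfs_drop_extent_cache() over [s, s + l). */
    static void drop_cache(unsigned long long s, unsigned long long l)
    {
            int i;
            for (i = 0; i < MAX_EMS; i++)
                    if (overlaps(&tree[i], s, l))
                            tree[i].used = 0;
    }

    int main(void)
    {
            int ret;
            add_mapping(0, 4096);           /* stale cached mapping */
            do {                            /* the create_pinned_em loop */
                    drop_cache(0, 8192);
                    ret = add_mapping(0, 8192);
            } while (ret == -EEXIST);
            printf("inserted: %d\n", ret);
            return 0;
    }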
@@ -6047,13 +6093,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6047 goto must_cow; 6093 goto must_cow;
6048 6094
6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6095 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6050 u64 orig_start = em->start; 6096 u64 orig_start = em->orig_start;
6097 u64 orig_block_len = em->orig_block_len;
6051 6098
6052 if (type == BTRFS_ORDERED_PREALLOC) { 6099 if (type == BTRFS_ORDERED_PREALLOC) {
6053 free_extent_map(em); 6100 free_extent_map(em);
6054 em = create_pinned_em(inode, start, len, 6101 em = create_pinned_em(inode, start, len,
6055 orig_start, 6102 orig_start,
6056 block_start, len, type); 6103 block_start, len,
6104 orig_block_len, type);
6057 if (IS_ERR(em)) { 6105 if (IS_ERR(em)) {
6058 btrfs_end_transaction(trans, root); 6106 btrfs_end_transaction(trans, root);
6059 goto unlock_err; 6107 goto unlock_err;
@@ -6077,7 +6125,8 @@ must_cow:
6077 * it above 6125 * it above
6078 */ 6126 */
6079 len = bh_result->b_size; 6127 len = bh_result->b_size;
6080 em = btrfs_new_extent_direct(inode, em, start, len); 6128 free_extent_map(em);
6129 em = btrfs_new_extent_direct(inode, start, len);
6081 if (IS_ERR(em)) { 6130 if (IS_ERR(em)) {
6082 ret = PTR_ERR(em); 6131 ret = PTR_ERR(em);
6083 goto unlock_err; 6132 goto unlock_err;
@@ -6318,6 +6367,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6367 struct btrfs_root *root = BTRFS_I(inode)->root;
6319 int ret; 6368 int ret;
6320 6369
6370 if (async_submit)
6371 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6372
6321 bio_get(bio); 6373 bio_get(bio);
6322 6374
6323 if (!write) { 6375 if (!write) {
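[editor's note] The new sync_writers counter (initialized in btrfs_alloc_inode further down) lets the direct-IO submit path skip handing checksum work to the async helper threads whenever a synchronous writer is in flight on the inode, since that writer will wait for the IO anyway. A hedged userspace analogue of the gate using C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int sync_writers;

    static void submit_bio(int async_submit)
    {
            /* Mirror of __btrfs_submit_dio_bio: fall back to synchronous
             * checksumming when someone is waiting on this inode anyway. */
            if (async_submit)
                    async_submit = !atomic_load(&sync_writers);
            printf("submitting %s\n", async_submit ? "async" : "sync");
    }

    int main(void)
    {
            submit_bio(1);                          /* async path taken */
            atomic_fetch_add(&sync_writers, 1);     /* e.g. an O_SYNC write */
            submit_bio(1);                          /* degrades to sync */
            atomic_fetch_sub(&sync_writers, 1);
            return 0;
    }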
@@ -6362,7 +6414,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6362{ 6414{
6363 struct inode *inode = dip->inode; 6415 struct inode *inode = dip->inode;
6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6416 struct btrfs_root *root = BTRFS_I(inode)->root;
6365 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6366 struct bio *bio; 6417 struct bio *bio;
6367 struct bio *orig_bio = dip->orig_bio; 6418 struct bio *orig_bio = dip->orig_bio;
6368 struct bio_vec *bvec = orig_bio->bi_io_vec; 6419 struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6426,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6375 int async_submit = 0; 6426 int async_submit = 0;
6376 6427
6377 map_length = orig_bio->bi_size; 6428 map_length = orig_bio->bi_size;
6378 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6429 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6379 &map_length, NULL, 0); 6430 &map_length, NULL, 0);
6380 if (ret) { 6431 if (ret) {
6381 bio_put(orig_bio); 6432 bio_put(orig_bio);
@@ -6429,7 +6480,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6429 bio->bi_end_io = btrfs_end_dio_bio; 6480 bio->bi_end_io = btrfs_end_dio_bio;
6430 6481
6431 map_length = orig_bio->bi_size; 6482 map_length = orig_bio->bi_size;
6432 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6483 ret = btrfs_map_block(root->fs_info, READ,
6484 start_sector << 9,
6433 &map_length, NULL, 0); 6485 &map_length, NULL, 0);
6434 if (ret) { 6486 if (ret) {
6435 bio_put(bio); 6487 bio_put(bio);
@@ -6582,9 +6634,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6582 btrfs_submit_direct, 0); 6634 btrfs_submit_direct, 0);
6583} 6635}
6584 6636
6637#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
6638
6585static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6639static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6586 __u64 start, __u64 len) 6640 __u64 start, __u64 len)
6587{ 6641{
6642 int ret;
6643
6644 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6645 if (ret)
6646 return ret;
6647
6588 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6648 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6589} 6649}
6590 6650
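[editor's note] btrfs_fiemap now rejects requests carrying flags it does not implement before walking any extents; fiemap_check_flags() is the generic VFS helper that compares the caller's flags against the filesystem's supported mask (here only FIEMAP_FLAG_SYNC) and fails with -EBADR otherwise, writing the offending flags back for userspace to inspect. A toy re-implementation of the check (hedged sketch, not the fs/ioctl.c code; EBADR is Linux-specific):

    #include <errno.h>
    #include <stdio.h>

    #define FIEMAP_FLAG_SYNC  0x0001
    #define FIEMAP_FLAG_XATTR 0x0002

    /* Reject any requested flag outside the supported mask, reporting
     * the offenders back through *fi_flags. */
    static int check_fiemap_flags(unsigned int *fi_flags,
                                  unsigned int fs_flags)
    {
            unsigned int incompat = *fi_flags & ~fs_flags;
            if (incompat) {
                    *fi_flags = incompat;
                    return -EBADR;
            }
            return 0;
    }

    int main(void)
    {
            unsigned int flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR;
            int ret = check_fiemap_flags(&flags, FIEMAP_FLAG_SYNC);
            printf("ret=%d unsupported=0x%x\n", ret, flags);
            return 0;
    }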
@@ -6855,7 +6915,6 @@ static int btrfs_truncate(struct inode *inode)
6855 int ret; 6915 int ret;
6856 int err = 0; 6916 int err = 0;
6857 struct btrfs_trans_handle *trans; 6917 struct btrfs_trans_handle *trans;
6858 unsigned long nr;
6859 u64 mask = root->sectorsize - 1; 6918 u64 mask = root->sectorsize - 1;
6860 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6919 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6861 6920
@@ -6910,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
6910 6969
6911 /* 6970 /*
6912 * 1 for the truncate slack space 6971 * 1 for the truncate slack space
6913 * 1 for the orphan item we're going to add
6914 * 1 for the orphan item deletion
6915 * 1 for updating the inode. 6972 * 1 for updating the inode.
6916 */ 6973 */
6917 trans = btrfs_start_transaction(root, 4); 6974 trans = btrfs_start_transaction(root, 2);
6918 if (IS_ERR(trans)) { 6975 if (IS_ERR(trans)) {
6919 err = PTR_ERR(trans); 6976 err = PTR_ERR(trans);
6920 goto out; 6977 goto out;
@@ -6925,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
6925 min_size); 6982 min_size);
6926 BUG_ON(ret); 6983 BUG_ON(ret);
6927 6984
6928 ret = btrfs_orphan_add(trans, inode);
6929 if (ret) {
6930 btrfs_end_transaction(trans, root);
6931 goto out;
6932 }
6933
6934 /* 6985 /*
6935 * setattr is responsible for setting the ordered_data_close flag, 6986 * setattr is responsible for setting the ordered_data_close flag,
6936 * but that is only tested during the last file release. That 6987 * but that is only tested during the last file release. That
@@ -6978,9 +7029,8 @@ static int btrfs_truncate(struct inode *inode)
6978 break; 7029 break;
6979 } 7030 }
6980 7031
6981 nr = trans->blocks_used;
6982 btrfs_end_transaction(trans, root); 7032 btrfs_end_transaction(trans, root);
6983 btrfs_btree_balance_dirty(root, nr); 7033 btrfs_btree_balance_dirty(root);
6984 7034
6985 trans = btrfs_start_transaction(root, 2); 7035 trans = btrfs_start_transaction(root, 2);
6986 if (IS_ERR(trans)) { 7036 if (IS_ERR(trans)) {
@@ -7000,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
7000 ret = btrfs_orphan_del(trans, inode); 7050 ret = btrfs_orphan_del(trans, inode);
7001 if (ret) 7051 if (ret)
7002 err = ret; 7052 err = ret;
7003 } else if (ret && inode->i_nlink > 0) {
7004 /*
7005 * Failed to do the truncate, remove us from the in memory
7006 * orphan list.
7007 */
7008 ret = btrfs_orphan_del(NULL, inode);
7009 } 7053 }
7010 7054
7011 if (trans) { 7055 if (trans) {
@@ -7014,9 +7058,8 @@ static int btrfs_truncate(struct inode *inode)
7014 if (ret && !err) 7058 if (ret && !err)
7015 err = ret; 7059 err = ret;
7016 7060
7017 nr = trans->blocks_used;
7018 ret = btrfs_end_transaction(trans, root); 7061 ret = btrfs_end_transaction(trans, root);
7019 btrfs_btree_balance_dirty(root, nr); 7062 btrfs_btree_balance_dirty(root);
7020 } 7063 }
7021 7064
7022out: 7065out:
@@ -7093,6 +7136,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7136 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7094 ei->io_tree.track_uptodate = 1; 7137 ei->io_tree.track_uptodate = 1;
7095 ei->io_failure_tree.track_uptodate = 1; 7138 ei->io_failure_tree.track_uptodate = 1;
7139 atomic_set(&ei->sync_writers, 0);
7096 mutex_init(&ei->log_mutex); 7140 mutex_init(&ei->log_mutex);
7097 mutex_init(&ei->delalloc_mutex); 7141 mutex_init(&ei->delalloc_mutex);
7098 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7142 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7247,8 @@ void btrfs_destroy_cachep(void)
7203 kmem_cache_destroy(btrfs_path_cachep); 7247 kmem_cache_destroy(btrfs_path_cachep);
7204 if (btrfs_free_space_cachep) 7248 if (btrfs_free_space_cachep)
7205 kmem_cache_destroy(btrfs_free_space_cachep); 7249 kmem_cache_destroy(btrfs_free_space_cachep);
7250 if (btrfs_delalloc_work_cachep)
7251 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7206} 7252}
7207 7253
7208int btrfs_init_cachep(void) 7254int btrfs_init_cachep(void)
@@ -7237,6 +7283,13 @@ int btrfs_init_cachep(void)
7237 if (!btrfs_free_space_cachep) 7283 if (!btrfs_free_space_cachep)
7238 goto fail; 7284 goto fail;
7239 7285
7286 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7287 sizeof(struct btrfs_delalloc_work), 0,
7288 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7289 NULL);
7290 if (!btrfs_delalloc_work_cachep)
7291 goto fail;
7292
7240 return 0; 7293 return 0;
7241fail: 7294fail:
7242 btrfs_destroy_cachep(); 7295 btrfs_destroy_cachep();
@@ -7308,6 +7361,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7361 if (S_ISDIR(old_inode->i_mode) && new_inode &&
7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7362 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7310 return -ENOTEMPTY; 7363 return -ENOTEMPTY;
7364
7365
7366 /* check for collisions, even if the name isn't there */
7367 ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7368 new_dentry->d_name.name,
7369 new_dentry->d_name.len);
7370
7371 if (ret) {
7372 if (ret == -EEXIST) {
7373 /* we shouldn't get
7374 * -EEXIST without a new_inode */
7375 if (!new_inode) {
7376 WARN_ON(1);
7377 return ret;
7378 }
7379 } else {
7380 /* maybe -EOVERFLOW */
7381 return ret;
7382 }
7383 }
7384 ret = 0;
7385
7311 /* 7386 /*
7312 * we're using rename to replace one file with another. 7387 * we're using rename to replace one file with another.
7313 * and the replacement file is large. Start IO on it now so 7388 * and the replacement file is large. Start IO on it now so
@@ -7447,39 +7522,110 @@ out_notrans:
7447 return ret; 7522 return ret;
7448} 7523}
7449 7524
7525static void btrfs_run_delalloc_work(struct btrfs_work *work)
7526{
7527 struct btrfs_delalloc_work *delalloc_work;
7528
7529 delalloc_work = container_of(work, struct btrfs_delalloc_work,
7530 work);
7531 if (delalloc_work->wait)
7532 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7533 else
7534 filemap_flush(delalloc_work->inode->i_mapping);
7535
7536 if (delalloc_work->delay_iput)
7537 btrfs_add_delayed_iput(delalloc_work->inode);
7538 else
7539 iput(delalloc_work->inode);
7540 complete(&delalloc_work->completion);
7541}
7542
7543struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7544 int wait, int delay_iput)
7545{
7546 struct btrfs_delalloc_work *work;
7547
7548 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7549 if (!work)
7550 return NULL;
7551
7552 init_completion(&work->completion);
7553 INIT_LIST_HEAD(&work->list);
7554 work->inode = inode;
7555 work->wait = wait;
7556 work->delay_iput = delay_iput;
7557 work->work.func = btrfs_run_delalloc_work;
7558
7559 return work;
7560}
7561
7562void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7563{
7564 wait_for_completion(&work->completion);
7565 kmem_cache_free(btrfs_delalloc_work_cachep, work);
7566}
7567
7450/* 7568/*
7451 * some fairly slow code that needs optimization. This walks the list 7569 * some fairly slow code that needs optimization. This walks the list
7452 * of all the inodes with pending delalloc and forces them to disk. 7570 * of all the inodes with pending delalloc and forces them to disk.
7453 */ 7571 */
7454int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 7572int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7455{ 7573{
7456 struct list_head *head = &root->fs_info->delalloc_inodes;
7457 struct btrfs_inode *binode; 7574 struct btrfs_inode *binode;
7458 struct inode *inode; 7575 struct inode *inode;
7576 struct btrfs_delalloc_work *work, *next;
7577 struct list_head works;
7578 struct list_head splice;
7579 int ret = 0;
7459 7580
7460 if (root->fs_info->sb->s_flags & MS_RDONLY) 7581 if (root->fs_info->sb->s_flags & MS_RDONLY)
7461 return -EROFS; 7582 return -EROFS;
7462 7583
7584 INIT_LIST_HEAD(&works);
7585 INIT_LIST_HEAD(&splice);
7586again:
7463 spin_lock(&root->fs_info->delalloc_lock); 7587 spin_lock(&root->fs_info->delalloc_lock);
7464 while (!list_empty(head)) { 7588 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
7465 binode = list_entry(head->next, struct btrfs_inode, 7589 while (!list_empty(&splice)) {
7590 binode = list_entry(splice.next, struct btrfs_inode,
7466 delalloc_inodes); 7591 delalloc_inodes);
7592
7593 list_del_init(&binode->delalloc_inodes);
7594
7467 inode = igrab(&binode->vfs_inode); 7595 inode = igrab(&binode->vfs_inode);
7468 if (!inode) 7596 if (!inode)
7469 list_del_init(&binode->delalloc_inodes); 7597 continue;
7598
7599 list_add_tail(&binode->delalloc_inodes,
7600 &root->fs_info->delalloc_inodes);
7470 spin_unlock(&root->fs_info->delalloc_lock); 7601 spin_unlock(&root->fs_info->delalloc_lock);
7471 if (inode) { 7602
7472 filemap_flush(inode->i_mapping); 7603 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7473 if (delay_iput) 7604 if (unlikely(!work)) {
7474 btrfs_add_delayed_iput(inode); 7605 ret = -ENOMEM;
7475 else 7606 goto out;
7476 iput(inode);
7477 } 7607 }
7608 list_add_tail(&work->list, &works);
7609 btrfs_queue_worker(&root->fs_info->flush_workers,
7610 &work->work);
7611
7478 cond_resched(); 7612 cond_resched();
7479 spin_lock(&root->fs_info->delalloc_lock); 7613 spin_lock(&root->fs_info->delalloc_lock);
7480 } 7614 }
7481 spin_unlock(&root->fs_info->delalloc_lock); 7615 spin_unlock(&root->fs_info->delalloc_lock);
7482 7616
7617 list_for_each_entry_safe(work, next, &works, list) {
7618 list_del_init(&work->list);
7619 btrfs_wait_and_free_delalloc_work(work);
7620 }
7621
7622 spin_lock(&root->fs_info->delalloc_lock);
7623 if (!list_empty(&root->fs_info->delalloc_inodes)) {
7624 spin_unlock(&root->fs_info->delalloc_lock);
7625 goto again;
7626 }
7627 spin_unlock(&root->fs_info->delalloc_lock);
7628
7483 /* the filemap_flush will queue IO into the worker threads, but 7629 /* the filemap_flush will queue IO into the worker threads, but
7484 * we have to make sure the IO is actually started and that 7630 * we have to make sure the IO is actually started and that
7485 * ordered extents get created before we return 7631 * ordered extents get created before we return
@@ -7493,6 +7639,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7493 } 7639 }
7494 atomic_dec(&root->fs_info->async_submit_draining); 7640 atomic_dec(&root->fs_info->async_submit_draining);
7495 return 0; 7641 return 0;
7642out:
7643 list_for_each_entry_safe(work, next, &works, list) {
7644 list_del_init(&work->list);
7645 btrfs_wait_and_free_delalloc_work(work);
7646 }
7647
7648 if (!list_empty_careful(&splice)) {
7649 spin_lock(&root->fs_info->delalloc_lock);
7650 list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
7651 spin_unlock(&root->fs_info->delalloc_lock);
7652 }
7653 return ret;
7496} 7654}
7497 7655
7498static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7656static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
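[editor's note] The btrfs_start_delalloc_inodes rewrite above stops flushing each inode inline under delalloc_lock: it splices the list, wraps every inode in a btrfs_delalloc_work item, queues them all on the flush_workers pool, then waits on each item's completion, looping while new delalloc inodes keep appearing. A pthread-based sketch of the queue-then-wait shape (the work body just prints; threads stand in for pool workers and error handling is trimmed):

    #include <pthread.h>
    #include <stdio.h>

    #define NR_INODES 4

    struct delalloc_work {
            int ino;                /* stand-in for the inode */
            pthread_t thread;       /* stand-in for a pool worker */
    };

    static void *run_delalloc_work(void *arg)
    {
            struct delalloc_work *w = arg;
            printf("flushing inode %d\n", w->ino);  /* filemap_flush() */
            return NULL;
    }

    int main(void)
    {
            struct delalloc_work works[NR_INODES];
            int i;

            /* queue phase: one work item per dirty inode */
            for (i = 0; i < NR_INODES; i++) {
                    works[i].ino = i + 256;
                    pthread_create(&works[i].thread, NULL,
                                   run_delalloc_work, &works[i]);
            }
            /* wait phase: btrfs_wait_and_free_delalloc_work() equivalent */
            for (i = 0; i < NR_INODES; i++)
                    pthread_join(works[i].thread, NULL);
            return 0;
    }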
@@ -7512,7 +7670,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7512 unsigned long ptr; 7670 unsigned long ptr;
7513 struct btrfs_file_extent_item *ei; 7671 struct btrfs_file_extent_item *ei;
7514 struct extent_buffer *leaf; 7672 struct extent_buffer *leaf;
7515 unsigned long nr = 0;
7516 7673
7517 name_len = strlen(symname) + 1; 7674 name_len = strlen(symname) + 1;
7518 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7675 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7767,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7610out_unlock: 7767out_unlock:
7611 if (!err) 7768 if (!err)
7612 d_instantiate(dentry, inode); 7769 d_instantiate(dentry, inode);
7613 nr = trans->blocks_used;
7614 btrfs_end_transaction(trans, root); 7770 btrfs_end_transaction(trans, root);
7615 if (drop_inode) { 7771 if (drop_inode) {
7616 inode_dec_link_count(inode); 7772 inode_dec_link_count(inode);
7617 iput(inode); 7773 iput(inode);
7618 } 7774 }
7619 btrfs_btree_balance_dirty(root, nr); 7775 btrfs_btree_balance_dirty(root);
7620 return err; 7776 return err;
7621} 7777}
7622 7778
@@ -7679,6 +7835,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7679 em->len = ins.offset; 7835 em->len = ins.offset;
7680 em->block_start = ins.objectid; 7836 em->block_start = ins.objectid;
7681 em->block_len = ins.offset; 7837 em->block_len = ins.offset;
7838 em->orig_block_len = ins.offset;
7682 em->bdev = root->fs_info->fs_devices->latest_bdev; 7839 em->bdev = root->fs_info->fs_devices->latest_bdev;
7683 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7840 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7684 em->generation = trans->transid; 7841 em->generation = trans->transid;
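[editor's note] The common thread in these inode.c hunks is the new em->orig_block_len field: it records the size of the on-disk extent backing a mapping, so a compressed extent sets block_len equal to orig_block_len (a read must fetch the whole compressed blob regardless of the file range), while prealloc records the allocated ins.offset. A worked example with made-up numbers:

    #include <stdio.h>

    struct toy_extent_map {
            unsigned long long start;           /* file offset */
            unsigned long long len;             /* logical length */
            unsigned long long block_start;     /* disk byte offset */
            unsigned long long block_len;       /* bytes read from disk */
            unsigned long long orig_block_len;  /* full on-disk extent */
    };

    int main(void)
    {
            /* 128K of file data held in one 16K compressed extent */
            struct toy_extent_map em = {
                    .start          = 0,
                    .len            = 128 * 1024,
                    .block_start    = 1048576,      /* made-up offset */
                    .block_len      = 16 * 1024,
                    .orig_block_len = 16 * 1024,
            };
            printf("compression ratio: %llux\n", em.len / em.block_len);
            return 0;
    }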
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..338f2597bf7f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
55#include "backref.h" 55#include "backref.h"
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h"
58 59
59/* Mask out flags that are inappropriate for the given type of inode. */ 60/* Mask out flags that are inappropriate for the given type of inode. */
60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 141 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 } 142 }
142 143
143 if (flags & BTRFS_INODE_NODATACOW) 144 if (flags & BTRFS_INODE_NODATACOW) {
144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
146 if (S_ISREG(inode->i_mode))
147 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
148 }
145 149
146 btrfs_update_iflags(inode); 150 btrfs_update_iflags(inode);
147} 151}
@@ -511,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root,
511 515
512 BUG_ON(ret); 516 BUG_ON(ret);
513 517
514 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
515fail: 518fail:
516 if (async_transid) { 519 if (async_transid) {
517 *async_transid = trans->transid; 520 *async_transid = trans->transid;
@@ -521,6 +524,10 @@ fail:
521 } 524 }
522 if (err && !ret) 525 if (err && !ret)
523 ret = err; 526 ret = err;
527
528 if (!ret)
529 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
530
524 return ret; 531 return ret;
525} 532}
526 533
@@ -571,8 +578,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
571 ret = btrfs_commit_transaction(trans, 578 ret = btrfs_commit_transaction(trans,
572 root->fs_info->extent_root); 579 root->fs_info->extent_root);
573 } 580 }
574 if (ret) 581 if (ret) {
582 /* cleanup_transaction has freed this for us */
583 if (trans->aborted)
584 pending_snapshot = NULL;
575 goto fail; 585 goto fail;
586 }
576 587
577 ret = pending_snapshot->error; 588 ret = pending_snapshot->error;
578 if (ret) 589 if (ret)
@@ -705,6 +716,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
705 if (error) 716 if (error)
706 goto out_dput; 717 goto out_dput;
707 718
719 /*
720 * even if this name doesn't exist, we may get hash collisions.
721 * check for them now when we can safely fail
722 */
723 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
724 dir->i_ino, name,
725 namelen);
726 if (error)
727 goto out_dput;
728
708 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 729 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
709 730
710 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 731 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
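[editor's note] btrfs keys directory items by a hash of the name, so two different names can land on the same key; the btrfs_check_dir_item_collision call added above lets mksubvol fail cleanly (-EEXIST, or -EOVERFLOW in the full kernel version) before any transaction work. A toy directory keyed by a deliberately weak hash shows why the check is about collisions rather than plain name equality (the real hash is crc32c-derived; this one is illustrative only):

    #include <errno.h>
    #include <stdio.h>

    #define NR_BUCKETS 8

    static const char *dir[NR_BUCKETS];     /* one name per hash key */

    /* Deliberately weak toy hash; btrfs uses a crc32c-based one. */
    static unsigned int name_hash(const char *name)
    {
            unsigned int h = 0;
            while (*name)
                    h += (unsigned char)*name++;
            return h % NR_BUCKETS;
    }

    static int check_dir_item_collision(const char *name)
    {
            if (!dir[name_hash(name)])
                    return 0;       /* key free, safe to insert */
            return -EEXIST;         /* same name or a colliding one */
    }

    int main(void)
    {
            dir[name_hash("ab")] = "ab";
            /* "ba" hashes like "ab" here: a collision, not a duplicate */
            printf("insert \"ba\": %d\n", check_dir_item_collision("ba"));
            return 0;
    }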
@@ -1225,7 +1246,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1225 } 1246 }
1226 1247
1227 defrag_count += ret; 1248 defrag_count += ret;
1228 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1249 balance_dirty_pages_ratelimited(inode->i_mapping);
1229 mutex_unlock(&inode->i_mutex); 1250 mutex_unlock(&inode->i_mutex);
1230 1251
1231 if (newer_than) { 1252 if (newer_than) {
@@ -1293,12 +1314,13 @@ out_ra:
1293 return ret; 1314 return ret;
1294} 1315}
1295 1316
1296static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1317static noinline int btrfs_ioctl_resize(struct file *file,
1297 void __user *arg) 1318 void __user *arg)
1298{ 1319{
1299 u64 new_size; 1320 u64 new_size;
1300 u64 old_size; 1321 u64 old_size;
1301 u64 devid = 1; 1322 u64 devid = 1;
1323 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1302 struct btrfs_ioctl_vol_args *vol_args; 1324 struct btrfs_ioctl_vol_args *vol_args;
1303 struct btrfs_trans_handle *trans; 1325 struct btrfs_trans_handle *trans;
1304 struct btrfs_device *device = NULL; 1326 struct btrfs_device *device = NULL;
@@ -1313,13 +1335,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1335 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1336 return -EPERM;
1315 1337
1316 mutex_lock(&root->fs_info->volume_mutex); 1338 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1339 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1340 return ret;
1319 ret = -EINVAL; 1341
1320 goto out; 1342 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1343 1)) {
1344 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1345 mnt_drop_write_file(file);
1346 return -EINVAL;
1321 } 1347 }
1322 1348
1349 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1350 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1351 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1352 ret = PTR_ERR(vol_args);
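[editor's note] Resize, device add/remove, defrag, balance, and the new dev-replace ioctl now all funnel through the single fs_info->mutually_exclusive_operation_running flag: atomic_xchg(..., 1) acts as a trylock (a returned 0 means we won, 1 means another operation holds it) and atomic_set(..., 0) releases it on every exit path. A hedged C11 rendering of the same pattern:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int op_running;

    static int try_exclusive_op(const char *name)
    {
            /* atomic_exchange returns the previous value: 0 = acquired */
            if (atomic_exchange(&op_running, 1)) {
                    printf("%s: operation already in progress\n", name);
                    return -1;
            }
            printf("%s: running\n", name);
            atomic_store(&op_running, 0);   /* release on the way out */
            return 0;
    }

    int main(void)
    {
            try_exclusive_op("resize");
            try_exclusive_op("balance");
            return 0;
    }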
@@ -1339,16 +1366,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1366 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1367 (unsigned long long)devid);
1341 } 1368 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1369
1370 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1371 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1372 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1373 (unsigned long long)devid);
1346 ret = -EINVAL; 1374 ret = -EINVAL;
1347 goto out_free; 1375 goto out_free;
1348 } 1376 }
1349 if (device->fs_devices && device->fs_devices->seeding) { 1377
1378 if (!device->writeable) {
1350 printk(KERN_INFO "btrfs: resizer unable to apply on " 1379 printk(KERN_INFO "btrfs: resizer unable to apply on "
1351 "seeding device %llu\n", 1380 "readonly device %llu\n",
1352 (unsigned long long)devid); 1381 (unsigned long long)devid);
1353 ret = -EINVAL; 1382 ret = -EINVAL;
1354 goto out_free; 1383 goto out_free;
@@ -1371,6 +1400,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1400 }
1372 } 1401 }
1373 1402
1403 if (device->is_tgtdev_for_dev_replace) {
1404 ret = -EINVAL;
1405 goto out_free;
1406 }
1407
1374 old_size = device->total_bytes; 1408 old_size = device->total_bytes;
1375 1409
1376 if (mod < 0) { 1410 if (mod < 0) {
@@ -1409,12 +1443,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1443 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1444 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1445 ret = btrfs_shrink_device(device, new_size);
1412 } 1446 } /* equal, nothing to do */
1413 1447
1414out_free: 1448out_free:
1415 kfree(vol_args); 1449 kfree(vol_args);
1416out: 1450out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1451 mutex_unlock(&root->fs_info->volume_mutex);
1452 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1453 mnt_drop_write_file(file);
1418 return ret; 1454 return ret;
1419} 1455}
1420 1456
@@ -2065,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2065 err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 2101 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
2066 if (err) 2102 if (err)
2067 goto out_dput; 2103 goto out_dput;
2068
2069 /* check if subvolume may be deleted by a non-root user */
2070 err = btrfs_may_delete(dir, dentry, 1);
2071 if (err)
2072 goto out_dput;
2073 } 2104 }
2074 2105
2106 /* check if subvolume may be deleted by a user */
2107 err = btrfs_may_delete(dir, dentry, 1);
2108 if (err)
2109 goto out_dput;
2110
2075 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { 2111 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
2076 err = -EINVAL; 2112 err = -EINVAL;
2077 goto out_dput; 2113 goto out_dput;
@@ -2153,13 +2189,22 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2153 struct btrfs_ioctl_defrag_range_args *range; 2189 struct btrfs_ioctl_defrag_range_args *range;
2154 int ret; 2190 int ret;
2155 2191
2156 if (btrfs_root_readonly(root))
2157 return -EROFS;
2158
2159 ret = mnt_want_write_file(file); 2192 ret = mnt_want_write_file(file);
2160 if (ret) 2193 if (ret)
2161 return ret; 2194 return ret;
2162 2195
2196 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2197 1)) {
2198 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2199 mnt_drop_write_file(file);
2200 return -EINVAL;
2201 }
2202
2203 if (btrfs_root_readonly(root)) {
2204 ret = -EROFS;
2205 goto out;
2206 }
2207
2163 switch (inode->i_mode & S_IFMT) { 2208 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2209 case S_IFDIR:
2165 if (!capable(CAP_SYS_ADMIN)) { 2210 if (!capable(CAP_SYS_ADMIN)) {
@@ -2209,6 +2254,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2209 ret = -EINVAL; 2254 ret = -EINVAL;
2210 } 2255 }
2211out: 2256out:
2257 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2212 mnt_drop_write_file(file); 2258 mnt_drop_write_file(file);
2213 return ret; 2259 return ret;
2214} 2260}
@@ -2221,13 +2267,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2267 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2268 return -EPERM;
2223 2269
2224 mutex_lock(&root->fs_info->volume_mutex); 2270 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2271 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2272 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2273 return -EINVAL;
2228 goto out;
2229 } 2274 }
2230 2275
2276 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2277 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2278 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2279 ret = PTR_ERR(vol_args);
@@ -2240,27 +2286,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2286 kfree(vol_args);
2241out: 2287out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2288 mutex_unlock(&root->fs_info->volume_mutex);
2289 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2290 return ret;
2244} 2291}
2245 2292
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2293static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2294{
2295 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2296 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2297 int ret;
2250 2298
2251 if (!capable(CAP_SYS_ADMIN)) 2299 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2300 return -EPERM;
2253 2301
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2302 ret = mnt_want_write_file(file);
2255 return -EROFS; 2303 if (ret)
2304 return ret;
2256 2305
2257 mutex_lock(&root->fs_info->volume_mutex); 2306 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2307 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2308 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2309 mnt_drop_write_file(file);
2261 goto out; 2310 return -EINVAL;
2262 } 2311 }
2263 2312
2313 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2314 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2315 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2316 ret = PTR_ERR(vol_args);
@@ -2273,6 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2323 kfree(vol_args);
2274out: 2324out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2325 mutex_unlock(&root->fs_info->volume_mutex);
2326 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2327 mnt_drop_write_file(file);
2276 return ret; 2328 return ret;
2277} 2329}
2278 2330
@@ -2328,7 +2380,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2380 s_uuid = di_args->uuid;
2329 2381
2330 mutex_lock(&fs_devices->device_list_mutex); 2382 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2383 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2384 mutex_unlock(&fs_devices->device_list_mutex);
2333 2385
2334 if (!dev) { 2386 if (!dev) {
@@ -2821,12 +2873,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2873 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2874 u64 objectid = 0;
2823 u64 dir_id; 2875 u64 dir_id;
2876 int ret;
2824 2877
2825 if (!capable(CAP_SYS_ADMIN)) 2878 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2879 return -EPERM;
2827 2880
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2881 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2882 if (ret)
2883 return ret;
2884
2885 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2886 ret = -EFAULT;
2887 goto out;
2888 }
2830 2889
2831 if (!objectid) 2890 if (!objectid)
2832 objectid = root->root_key.objectid; 2891 objectid = root->root_key.objectid;
@@ -2836,21 +2895,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2895 location.offset = (u64)-1;
2837 2896
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2897 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2898 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2899 ret = PTR_ERR(new_root);
2900 goto out;
2901 }
2841 2902
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2903 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2904 ret = -ENOENT;
2905 goto out;
2906 }
2844 2907
2845 path = btrfs_alloc_path(); 2908 path = btrfs_alloc_path();
2846 if (!path) 2909 if (!path) {
2847 return -ENOMEM; 2910 ret = -ENOMEM;
2911 goto out;
2912 }
2848 path->leave_spinning = 1; 2913 path->leave_spinning = 1;
2849 2914
2850 trans = btrfs_start_transaction(root, 1); 2915 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2916 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2917 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2918 ret = PTR_ERR(trans);
2919 goto out;
2854 } 2920 }
2855 2921
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2922 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2927,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2927 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2928 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2929 "this isn't going to work\n");
2864 return -ENOENT; 2930 ret = -ENOENT;
2931 goto out;
2865 } 2932 }
2866 2933
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2934 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2938,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2938
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2939 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2940 btrfs_end_transaction(trans, root);
2874 2941out:
2875 return 0; 2942 mnt_drop_write_file(file);
2943 return ret;
2876} 2944}
2877 2945
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2946void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3104,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3104 return 0;
3037} 3105}
3038 3106
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3107static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3108 void __user *argp)
3040{ 3109{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3110 struct btrfs_trans_handle *trans;
3043 u64 transid; 3111 u64 transid;
3044 int ret; 3112 int ret;
3045 3113
3046 trans = btrfs_start_transaction(root, 0); 3114 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3115 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3116 if (PTR_ERR(trans) != -ENOENT)
3117 return PTR_ERR(trans);
3118
3119 /* No running transaction, don't bother */
3120 transid = root->fs_info->last_trans_committed;
3121 goto out;
3122 }
3049 transid = trans->transid; 3123 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3124 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3125 if (ret) {
3052 btrfs_end_transaction(trans, root); 3126 btrfs_end_transaction(trans, root);
3053 return ret; 3127 return ret;
3054 } 3128 }
3055 3129out:
3056 if (argp) 3130 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3131 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3132 return -EFAULT;
3059 return 0; 3133 return 0;
3060} 3134}
3061 3135
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3136static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3137 void __user *argp)
3063{ 3138{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3139 u64 transid;
3066 3140
3067 if (argp) { 3141 if (argp) {
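[editor's note] BTRFS_IOC_START_SYNC used to open a fresh transaction just to learn a transid; it now calls btrfs_attach_transaction, which returns -ENOENT when nothing is running, in which case the ioctl simply reports last_trans_committed instead of forcing an empty commit. The attach-or-report shape, sketched:

    #include <stdio.h>

    static long long running_transid = -1;      /* -1: none running */
    static long long last_trans_committed = 41;

    /* Never start new work just to obtain an id to wait on. */
    static long long start_sync(void)
    {
            if (running_transid < 0)            /* the -ENOENT case */
                    return last_trans_committed;
            /* else: kick off an async commit of the running transaction */
            return running_transid;
    }

    int main(void)
    {
            printf("transid to wait on: %lld\n", start_sync());
            return 0;
    }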
@@ -3073,10 +3147,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3147 return btrfs_wait_for_commit(root, transid);
3074} 3148}
3075 3149
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3150static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3151{
3078 int ret; 3152 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3153 struct btrfs_ioctl_scrub_args *sa;
3154 int ret;
3080 3155
3081 if (!capable(CAP_SYS_ADMIN)) 3156 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3157 return -EPERM;
@@ -3085,12 +3160,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3160 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3161 return PTR_ERR(sa);
3087 3162
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3163 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3164 ret = mnt_want_write_file(file);
3165 if (ret)
3166 goto out;
3167 }
3168
3169 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3170 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3171 0);
3090 3172
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3173 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3174 ret = -EFAULT;
3093 3175
3176 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3177 mnt_drop_write_file(file);
3178out:
3094 kfree(sa); 3179 kfree(sa);
3095 return ret; 3180 return ret;
3096} 3181}
@@ -3100,7 +3185,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3185 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3186 return -EPERM;
3102 3187
3103 return btrfs_scrub_cancel(root); 3188 return btrfs_scrub_cancel(root->fs_info);
3104} 3189}
3105 3190
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3191static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3234,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3234 return ret;
3150} 3235}
3151 3236
3237static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3238{
3239 struct btrfs_ioctl_dev_replace_args *p;
3240 int ret;
3241
3242 if (!capable(CAP_SYS_ADMIN))
3243 return -EPERM;
3244
3245 p = memdup_user(arg, sizeof(*p));
3246 if (IS_ERR(p))
3247 return PTR_ERR(p);
3248
3249 switch (p->cmd) {
3250 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3251 if (atomic_xchg(
3252 &root->fs_info->mutually_exclusive_operation_running,
3253 1)) {
3254 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3255 ret = -EINPROGRESS;
3256 } else {
3257 ret = btrfs_dev_replace_start(root, p);
3258 atomic_set(
3259 &root->fs_info->mutually_exclusive_operation_running,
3260 0);
3261 }
3262 break;
3263 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3264 btrfs_dev_replace_status(root->fs_info, p);
3265 ret = 0;
3266 break;
3267 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3268 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3269 break;
3270 default:
3271 ret = -EINVAL;
3272 break;
3273 }
3274
3275 if (copy_to_user(arg, p, sizeof(*p)))
3276 ret = -EFAULT;
3277
3278 kfree(p);
3279 return ret;
3280}
3281
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3282static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3283{
3154 int ret = 0; 3284 int ret = 0;
@@ -3314,6 +3444,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3314 struct btrfs_fs_info *fs_info = root->fs_info; 3444 struct btrfs_fs_info *fs_info = root->fs_info;
3315 struct btrfs_ioctl_balance_args *bargs; 3445 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3446 struct btrfs_balance_control *bctl;
3447 bool need_unlock; /* for mut. excl. ops lock */
3317 int ret; 3448 int ret;
3318 3449
3319 if (!capable(CAP_SYS_ADMIN)) 3450 if (!capable(CAP_SYS_ADMIN))
@@ -3323,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3323 if (ret) 3454 if (ret)
3324 return ret; 3455 return ret;
3325 3456
3326 mutex_lock(&fs_info->volume_mutex); 3457again:
3458 if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
3459 mutex_lock(&fs_info->volume_mutex);
3460 mutex_lock(&fs_info->balance_mutex);
3461 need_unlock = true;
3462 goto locked;
3463 }
3464
3465 /*
3466 * mut. excl. ops lock is locked. Three possibilities:
3467 * (1) some other op is running
3468 * (2) balance is running
3469 * (3) balance is paused -- special case (think resume)
3470 */
3327 mutex_lock(&fs_info->balance_mutex); 3471 mutex_lock(&fs_info->balance_mutex);
3472 if (fs_info->balance_ctl) {
3473 /* this is either (2) or (3) */
3474 if (!atomic_read(&fs_info->balance_running)) {
3475 mutex_unlock(&fs_info->balance_mutex);
3476 if (!mutex_trylock(&fs_info->volume_mutex))
3477 goto again;
3478 mutex_lock(&fs_info->balance_mutex);
3479
3480 if (fs_info->balance_ctl &&
3481 !atomic_read(&fs_info->balance_running)) {
3482 /* this is (3) */
3483 need_unlock = false;
3484 goto locked;
3485 }
3486
3487 mutex_unlock(&fs_info->balance_mutex);
3488 mutex_unlock(&fs_info->volume_mutex);
3489 goto again;
3490 } else {
3491 /* this is (2) */
3492 mutex_unlock(&fs_info->balance_mutex);
3493 ret = -EINPROGRESS;
3494 goto out;
3495 }
3496 } else {
3497 /* this is (1) */
3498 mutex_unlock(&fs_info->balance_mutex);
3499 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3500 ret = -EINVAL;
3501 goto out;
3502 }
3503
3504locked:
3505 BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
3328 3506
3329 if (arg) { 3507 if (arg) {
3330 bargs = memdup_user(arg, sizeof(*bargs)); 3508 bargs = memdup_user(arg, sizeof(*bargs));
3331 if (IS_ERR(bargs)) { 3509 if (IS_ERR(bargs)) {
3332 ret = PTR_ERR(bargs); 3510 ret = PTR_ERR(bargs);
3333 goto out; 3511 goto out_unlock;
3334 } 3512 }
3335 3513
3336 if (bargs->flags & BTRFS_BALANCE_RESUME) { 3514 if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3374,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3374 } 3552 }
3375 3553
3376do_balance: 3554do_balance:
3377 ret = btrfs_balance(bctl, bargs);
3378 /* 3555 /*
3379 * bctl is freed in __cancel_balance or in free_fs_info if 3556 * Ownership of bctl and mutually_exclusive_operation_running
3380 * restriper was paused all the way until unmount 3557 * goes to btrfs_balance. bctl is freed in __cancel_balance,
3558 * or, if restriper was paused all the way until unmount, in
3559 * free_fs_info. mutually_exclusive_operation_running is
3560 * cleared in __cancel_balance.
3381 */ 3561 */
3562 need_unlock = false;
3563
3564 ret = btrfs_balance(bctl, bargs);
3565
3382 if (arg) { 3566 if (arg) {
3383 if (copy_to_user(arg, bargs, sizeof(*bargs))) 3567 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3384 ret = -EFAULT; 3568 ret = -EFAULT;
@@ -3386,9 +3570,12 @@ do_balance:
3386 3570
3387out_bargs: 3571out_bargs:
3388 kfree(bargs); 3572 kfree(bargs);
3389out: 3573out_unlock:
3390 mutex_unlock(&fs_info->balance_mutex); 3574 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3575 mutex_unlock(&fs_info->volume_mutex);
3576 if (need_unlock)
3577 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3578out:
3392 mnt_drop_write_file(file); 3579 mnt_drop_write_file(file);
3393 return ret; 3580 return ret;
3394} 3581}
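[editor's note] Balance is the odd one out because a paused balance legitimately coexists with the exclusion flag (case 3 in the comment above): resume must take volume_mutex and balance_mutex without deadlocking against whoever holds the flag, hence the mutex_trylock plus "goto again" retry, and on the actual balance path ownership of the flag is handed to btrfs_balance, which clears it in __cancel_balance. The trylock-and-retry shape, in pthreads:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t volume_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t balance_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int balance_paused = 1;  /* pretend a paused balance exists */

    /* Lock order is volume_mutex then balance_mutex; since we discover
     * we need volume_mutex while holding balance_mutex, use trylock and
     * back off rather than blocking (deadlock avoidance). */
    static void resume_paused_balance(void)
    {
    again:
            pthread_mutex_lock(&balance_mutex);
            if (balance_paused) {
                    pthread_mutex_unlock(&balance_mutex);
                    if (pthread_mutex_trylock(&volume_mutex))
                            goto again;
                    pthread_mutex_lock(&balance_mutex);
                    /* re-check: state may have moved while unlocked */
                    if (balance_paused)
                            printf("resuming balance\n");
                    pthread_mutex_unlock(&balance_mutex);
                    pthread_mutex_unlock(&volume_mutex);
                    return;
            }
            pthread_mutex_unlock(&balance_mutex);
    }

    int main(void)
    {
            resume_paused_balance();
            return 0;
    }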
@@ -3441,8 +3628,9 @@ out:
3441 return ret; 3628 return ret;
3442} 3629}
3443 3630
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3631static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3632{
3633 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3634 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3635 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3636 int ret;
@@ -3451,12 +3639,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3639 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3640 return -EPERM;
3453 3641
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3642 ret = mnt_want_write_file(file);
3455 return -EROFS; 3643 if (ret)
3644 return ret;
3456 3645
3457 sa = memdup_user(arg, sizeof(*sa)); 3646 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3647 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3648 ret = PTR_ERR(sa);
3649 goto drop_write;
3650 }
3460 3651
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3652 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3653 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3680,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3680 if (err && !ret)
3490 ret = err; 3681 ret = err;
3491 } 3682 }
3492
3493out: 3683out:
3494 kfree(sa); 3684 kfree(sa);
3685drop_write:
3686 mnt_drop_write_file(file);
3495 return ret; 3687 return ret;
3496} 3688}
3497 3689
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3690static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3691{
3692 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3693 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3694 struct btrfs_trans_handle *trans;
3502 int ret; 3695 int ret;
@@ -3505,12 +3698,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3698 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3699 return -EPERM;
3507 3700
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3701 ret = mnt_want_write_file(file);
3509 return -EROFS; 3702 if (ret)
3703 return ret;
3510 3704
3511 sa = memdup_user(arg, sizeof(*sa)); 3705 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3706 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3707 ret = PTR_ERR(sa);
3708 goto drop_write;
3709 }
3514 3710
3515 trans = btrfs_join_transaction(root); 3711 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3712 if (IS_ERR(trans)) {
@@ -3533,11 +3729,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3729
3534out: 3730out:
3535 kfree(sa); 3731 kfree(sa);
3732drop_write:
3733 mnt_drop_write_file(file);
3536 return ret; 3734 return ret;
3537} 3735}
3538 3736
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3737static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3738{
3739 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3740 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3741 struct btrfs_trans_handle *trans;
3543 int ret; 3742 int ret;
@@ -3546,12 +3745,20 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3745 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3746 return -EPERM;
3548 3747
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3748 ret = mnt_want_write_file(file);
3550 return -EROFS; 3749 if (ret)
3750 return ret;
3551 3751
3552 sa = memdup_user(arg, sizeof(*sa)); 3752 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3753 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3754 ret = PTR_ERR(sa);
3755 goto drop_write;
3756 }
3757
3758 if (!sa->qgroupid) {
3759 ret = -EINVAL;
3760 goto out;
3761 }
3555 3762
3556 trans = btrfs_join_transaction(root); 3763 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3764 if (IS_ERR(trans)) {
@@ -3573,11 +3780,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3780
3574out: 3781out:
3575 kfree(sa); 3782 kfree(sa);
3783drop_write:
3784 mnt_drop_write_file(file);
3576 return ret; 3785 return ret;
3577} 3786}
3578 3787
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3788static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3789{
3790 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3791 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3792 struct btrfs_trans_handle *trans;
3583 int ret; 3793 int ret;
@@ -3587,12 +3797,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3797 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3798 return -EPERM;
3589 3799
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3800 ret = mnt_want_write_file(file);
3591 return -EROFS; 3801 if (ret)
3802 return ret;
3592 3803
3593 sa = memdup_user(arg, sizeof(*sa)); 3804 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3805 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3806 ret = PTR_ERR(sa);
3807 goto drop_write;
3808 }
3596 3809
3597 trans = btrfs_join_transaction(root); 3810 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3811 if (IS_ERR(trans)) {
@@ -3615,6 +3828,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3828
3616out: 3829out:
3617 kfree(sa); 3830 kfree(sa);
3831drop_write:
3832 mnt_drop_write_file(file);
3618 return ret; 3833 return ret;
3619} 3834}
3620 3835
@@ -3735,11 +3950,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3950 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3951 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3952 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3953 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3954 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3955 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3956 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3957 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3958 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3959 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3960 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3983,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3983 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3984 return 0;
3770 case BTRFS_IOC_START_SYNC: 3985 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3986 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3987 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3988 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3989 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3990 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3991 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3992 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3993 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +4005,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 4005 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 4006 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 4007 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 4008 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 4009 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 4010 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 4011 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 4012 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 4013 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 4014 return btrfs_ioctl_qgroup_limit(file, argp);
4015 case BTRFS_IOC_DEV_REPLACE:
4016 return btrfs_ioctl_dev_replace(root, argp);
3800 } 4017 }
3801 4018
3802 return -ENOTTY; 4019 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
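
The new ioctl is driven entirely through btrfs_ioctl_dev_replace_args above. A minimal userspace sketch that starts a replace by device path; the header include path is an assumption, and error handling is abbreviated:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <btrfs/ioctl.h>	/* assumed install location of this header */

static int start_replace(const char *mnt, const char *src, const char *tgt)
{
	struct btrfs_ioctl_dev_replace_args args;
	int ret, fd = open(mnt, O_RDONLY);

	if (fd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
	args.start.srcdevid = 0;	/* 0 means: identify source by name */
	strncpy((char *)args.start.srcdev_name, src,
		BTRFS_DEVICE_PATH_NAME_MAX);
	strncpy((char *)args.start.tgtdev_name, tgt,
		BTRFS_DEVICE_PATH_NAME_MAX);
	args.start.cont_reading_from_srcdev_mode =
		BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID;
	ret = ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args);
	close(fd);
	if (ret < 0 || args.result != BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
		return -1;
	return 0;
}

Polling later with BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS fills the status member, where progress_1000 / 10 is the percentage completed.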
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
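
Both helpers scale a u64 by a coarse (tenths) or fine (hundredths) factor, using do_div() so the 64-bit division also works on 32-bit kernels. For example, div_factor(1000, 8) returns 800 and div_factor_fine(1000, 85) returns 850; note that the multiply happens before the divide, so num values near U64_MAX/factor would overflow.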
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..e5ed56729607 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
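
The rewrite above queues one flush_work per ordered extent on the flush_workers pool and only then blocks, so the per-extent btrfs_start_ordered_extent() waits overlap instead of running back to back. The iput()/btrfs_add_delayed_iput() moves into the second loop, presumably because the inode must stay referenced until wait_for_completion() guarantees the worker is done with it.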
511 536
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
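
Since btrfs_run_ordered_operations() now returns an int (-ENOMEM when a delalloc work item cannot be allocated, after requeueing the remaining inodes), callers have to propagate the error; a hypothetical call site:

	ret = btrfs_run_ordered_operations(root, 1);
	if (ret)
		return ret;	/* -ENOMEM: flush could not be queued */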
571 618
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -792,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
792 * if the disk i_size is already at the inode->i_size, or 836 * if the disk i_size is already at the inode->i_size, or
793 * this ordered extent is inside the disk i_size, we're done 837 * this ordered extent is inside the disk i_size, we're done
794 */ 838 */
795 if (disk_i_size == i_size || offset <= disk_i_size) { 839 if (disk_i_size == i_size)
840 goto out;
841
842 /*
843 * We still need to update disk_i_size if outstanding_isize is greater
844 * than disk_i_size.
845 */
846 if (offset <= disk_i_size &&
847 (!ordered || ordered->outstanding_isize <= disk_i_size))
796 goto out; 848 goto out;
797 }
798 849
799 /* 850 /*
800 * walk backward from this ordered extent to disk_i_size. 851 * walk backward from this ordered extent to disk_i_size.
@@ -826,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
826 break; 877 break;
827 if (test->file_offset >= i_size) 878 if (test->file_offset >= i_size)
828 break; 879 break;
829 if (test->file_offset >= disk_i_size) { 880 if (entry_end(test) > disk_i_size) {
830 /* 881 /*
831 * we don't update disk_i_size now, so record this 882 * we don't update disk_i_size now, so record this
832 * undealt i_size. Or we will not know the real 883 * undealt i_size. Or we will not know the real
@@ -934,15 +985,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 985 if (last_mod < root->fs_info->last_trans_committed)
935 return; 986 return;
936 987
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 988 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 989 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 990 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +1001,7 @@ int __init ordered_data_init(void)
959 NULL); 1001 NULL);
960 if (!btrfs_ordered_extent_cache) 1002 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 1003 return -ENOMEM;
1004
962 return 0; 1005 return 0;
963} 1006}
964 1007
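
The reworked early-exit in btrfs_ordered_update_i_size() above covers a subtle case: with disk_i_size at, say, 4096, an ordered extent ending at or below 4096 used to be skipped outright even if buffered writes had already pushed outstanding_isize to 8192. Now the shortcut is only taken when outstanding_isize does not exceed disk_i_size, so the on-disk size can still catch up; likewise the backward walk now compares entry_end(test) instead of test->file_offset, so an extent straddling disk_i_size is no longer ignored.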
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78 78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82 82
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..a5c856234323 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -379,6 +379,13 @@ next1:
379 379
380 ret = add_relation_rb(fs_info, found_key.objectid, 380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset); 381 found_key.offset);
382 if (ret == -ENOENT) {
383 printk(KERN_WARNING
384 "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
385 (unsigned long long)found_key.objectid,
386 (unsigned long long)found_key.offset);
387 ret = 0; /* ignore the error */
388 }
382 if (ret) 389 if (ret)
383 goto out; 390 goto out;
384next2: 391next2:
@@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
956 struct btrfs_fs_info *fs_info, u64 qgroupid) 963 struct btrfs_fs_info *fs_info, u64 qgroupid)
957{ 964{
958 struct btrfs_root *quota_root; 965 struct btrfs_root *quota_root;
966 struct btrfs_qgroup *qgroup;
959 int ret = 0; 967 int ret = 0;
960 968
961 quota_root = fs_info->quota_root; 969 quota_root = fs_info->quota_root;
962 if (!quota_root) 970 if (!quota_root)
963 return -EINVAL; 971 return -EINVAL;
964 972
973 /* check if there are no relations to this qgroup */
974 spin_lock(&fs_info->qgroup_lock);
975 qgroup = find_qgroup_rb(fs_info, qgroupid);
976 if (qgroup) {
977 if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
978 spin_unlock(&fs_info->qgroup_lock);
979 return -EBUSY;
980 }
981 }
982 spin_unlock(&fs_info->qgroup_lock);
983
965 ret = del_qgroup_item(trans, quota_root, qgroupid); 984 ret = del_qgroup_item(trans, quota_root, qgroupid);
966 985
967 spin_lock(&fs_info->qgroup_lock); 986 spin_lock(&fs_info->qgroup_lock);
968 del_qgroup_rb(quota_root->fs_info, qgroupid); 987 del_qgroup_rb(quota_root->fs_info, qgroupid);
969
970 spin_unlock(&fs_info->qgroup_lock); 988 spin_unlock(&fs_info->qgroup_lock);
971 989
972 return ret; 990 return ret;
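
With this check, deleting a qgroup that is still related to another (as parent or member) fails with -EBUSY instead of leaving an orphan relation behind; combined with the new warning in the loader above, orphans created by older kernels are reported and then tolerated. Userspace is expected to drop relations first, roughly as follows (struct layouts assumed from the existing qgroup ioctls):

	struct btrfs_ioctl_qgroup_assign_args aa =
		{ .assign = 0, .src = child_id, .dst = parent_id };
	ioctl(fd, BTRFS_IOC_QGROUP_ASSIGN, &aa);	/* remove relation */

	struct btrfs_ioctl_qgroup_create_args ca =
		{ .create = 0, .qgroupid = child_id };
	ioctl(fd, BTRFS_IOC_QGROUP_CREATE, &ca);	/* no longer -EBUSY */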
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
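
btrfs_reada_add() previously ignored a reada_add_block() failure; now it frees the control structure and returns ERR_PTR(-ENOMEM), so callers must follow the IS_ERR() convention. A sketch, with parameter names assumed:

	struct reada_control *rc;

	rc = btrfs_reada_add(root, &key_start, &key_end);
	if (IS_ERR(rc))
		return PTR_ERR(rc);	/* -ENOMEM from the path above */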
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
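
All the relocation hunks above follow two mechanical API changes: btrfs_block_rsv_add()/btrfs_block_rsv_refill() grow a flush-control argument (BTRFS_RESERVE_FLUSH_ALL throughout this file), and btrfs_btree_balance_dirty() no longer takes a count of blocks used, so the trans->blocks_used bookkeeping through the local nr disappears everywhere.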
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..67783e03d121 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 STRATO. All rights reserved. 2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "dev-replace.h"
28#include "check-integrity.h" 29#include "check-integrity.h"
29#include "rcu-string.h" 30#include "rcu-string.h"
30 31
@@ -42,10 +43,23 @@
42 */ 43 */
43 44
44struct scrub_block; 45struct scrub_block;
45struct scrub_dev; 46struct scrub_ctx;
46 47
47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48/*
48#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 49 * the following three values only influence performance.
50 * The last one configures the number of parallel and outstanding I/O
51 * operations. The first two values configure an upper limit for the number
52 * of (dynamically allocated) pages that are added to a bio.
53 */
54#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
55#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
56#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
57
58/*
59 * the following value times PAGE_SIZE needs to be large enough to match the
60 * largest node/leaf/sector size that shall be supported.
61 * Values larger than BTRFS_STRIPE_LEN are not supported.
62 */
49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 63#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
50 64
51struct scrub_page { 65struct scrub_page {
@@ -56,6 +70,8 @@ struct scrub_page {
56 u64 generation; 70 u64 generation;
57 u64 logical; 71 u64 logical;
58 u64 physical; 72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t ref_count;
59 struct { 75 struct {
60 unsigned int mirror_num:8; 76 unsigned int mirror_num:8;
61 unsigned int have_csum:1; 77 unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
66 82
67struct scrub_bio { 83struct scrub_bio {
68 int index; 84 int index;
69 struct scrub_dev *sdev; 85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
70 struct bio *bio; 87 struct bio *bio;
71 int err; 88 int err;
72 u64 logical; 89 u64 logical;
73 u64 physical; 90 u64 physical;
74 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
75 int page_count; 96 int page_count;
76 int next_free; 97 int next_free;
77 struct btrfs_work work; 98 struct btrfs_work work;
78}; 99};
79 100
80struct scrub_block { 101struct scrub_block {
81 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82 int page_count; 103 int page_count;
83 atomic_t outstanding_pages; 104 atomic_t outstanding_pages;
84 atomic_t ref_count; /* free mem on transition to zero */ 105 atomic_t ref_count; /* free mem on transition to zero */
85 struct scrub_dev *sdev; 106 struct scrub_ctx *sctx;
86 struct { 107 struct {
87 unsigned int header_error:1; 108 unsigned int header_error:1;
88 unsigned int checksum_error:1; 109 unsigned int checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
91 }; 112 };
92}; 113};
93 114
94struct scrub_dev { 115struct scrub_wr_ctx {
95 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 116 struct scrub_bio *wr_curr_bio;
96 struct btrfs_device *dev; 117 struct btrfs_device *tgtdev;
118 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119 atomic_t flush_all_writes;
120 struct mutex wr_lock;
121};
122
123struct scrub_ctx {
124 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
125 struct btrfs_root *dev_root;
97 int first_free; 126 int first_free;
98 int curr; 127 int curr;
99 atomic_t in_flight; 128 atomic_t bios_in_flight;
100 atomic_t fixup_cnt; 129 atomic_t workers_pending;
101 spinlock_t list_lock; 130 spinlock_t list_lock;
102 wait_queue_head_t list_wait; 131 wait_queue_head_t list_wait;
103 u16 csum_size; 132 u16 csum_size;
104 struct list_head csum_list; 133 struct list_head csum_list;
105 atomic_t cancel_req; 134 atomic_t cancel_req;
106 int readonly; 135 int readonly;
107 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 136 int pages_per_rd_bio;
108 u32 sectorsize; 137 u32 sectorsize;
109 u32 nodesize; 138 u32 nodesize;
110 u32 leafsize; 139 u32 leafsize;
140
141 int is_dev_replace;
142 struct scrub_wr_ctx wr_ctx;
143
111 /* 144 /*
112 * statistics 145 * statistics
113 */ 146 */
@@ -116,13 +149,23 @@ struct scrub_dev {
116}; 149};
117 150
118struct scrub_fixup_nodatasum { 151struct scrub_fixup_nodatasum {
119 struct scrub_dev *sdev; 152 struct scrub_ctx *sctx;
153 struct btrfs_device *dev;
120 u64 logical; 154 u64 logical;
121 struct btrfs_root *root; 155 struct btrfs_root *root;
122 struct btrfs_work work; 156 struct btrfs_work work;
123 int mirror_num; 157 int mirror_num;
124}; 158};
125 159
160struct scrub_copy_nocow_ctx {
161 struct scrub_ctx *sctx;
162 u64 logical;
163 u64 len;
164 int mirror_num;
165 u64 physical_for_dev_replace;
166 struct btrfs_work work;
167};
168
126struct scrub_warning { 169struct scrub_warning {
127 struct btrfs_path *path; 170 struct btrfs_path *path;
128 u64 extent_item_size; 171 u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
137}; 180};
138 181
139 182
183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev, 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
142 struct btrfs_mapping_tree *map_tree, 189 struct btrfs_fs_info *fs_info,
190 struct scrub_block *original_sblock,
143 u64 length, u64 logical, 191 u64 length, u64 logical,
144 struct scrub_block *sblock); 192 struct scrub_block *sblocks_for_recheck);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
146 struct scrub_block *sblock, int is_metadata, 194 struct scrub_block *sblock, int is_metadata,
147 int have_csum, u8 *csum, u64 generation, 195 int have_csum, u8 *csum, u64 generation,
148 u16 csum_size); 196 u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150 struct scrub_block *sblock, 198 struct scrub_block *sblock,
151 int is_metadata, int have_csum, 199 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159 struct scrub_block *sblock_good, 207 struct scrub_block *sblock_good,
160 int page_num, int force_write); 208 int page_num, int force_write);
209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211 int page_num);
161static int scrub_checksum_data(struct scrub_block *sblock); 212static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock); 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock); 214static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock); 215static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock); 216static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev, 217static void scrub_page_get(struct scrub_page *spage);
167 struct scrub_page *spage); 218static void scrub_page_put(struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
169 u64 physical, u64 flags, u64 gen, int mirror_num, 220 struct scrub_page *spage);
170 u8 *csum, int force); 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222 u64 physical, struct btrfs_device *dev, u64 flags,
223 u64 gen, int mirror_num, u8 *csum, int force,
224 u64 physical_for_dev_replace);
171static void scrub_bio_end_io(struct bio *bio, int err); 225static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work); 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock); 227static void scrub_block_complete(struct scrub_block *sblock);
228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229 u64 extent_logical, u64 extent_len,
230 u64 *extent_physical,
231 struct btrfs_device **extent_dev,
232 int *extent_mirror_num);
233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234 struct scrub_wr_ctx *wr_ctx,
235 struct btrfs_fs_info *fs_info,
236 struct btrfs_device *dev,
237 int is_dev_replace);
238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_page *spage);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio, int err);
243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244static int write_page_nocow(struct scrub_ctx *sctx,
245 u64 physical_for_dev_replace, struct page *page);
246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247 void *ctx);
248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249 int mirror_num, u64 physical_for_dev_replace);
250static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254{
255 atomic_inc(&sctx->bios_in_flight);
256}
174 257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262}
175 263
176static void scrub_free_csums(struct scrub_dev *sdev) 264/*
265 * used for workers that require transaction commits (i.e., for the
266 * NOCOW case)
267 */
268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
177{ 269{
178 while (!list_empty(&sdev->csum_list)) { 270 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272 /*
273 * increment scrubs_running to prevent cancel requests from
274 * completing as long as a worker is running. we must also
275 * increment scrubs_paused to prevent deadlocking on pause
276 * requests used for transactions commits (as the worker uses a
277 * transaction context). it is safe to regard the worker
278 * as paused for all matters practical. effectively, we only
279 * avoid cancellation requests from completing.
280 */
281 mutex_lock(&fs_info->scrub_lock);
282 atomic_inc(&fs_info->scrubs_running);
283 atomic_inc(&fs_info->scrubs_paused);
284 mutex_unlock(&fs_info->scrub_lock);
285 atomic_inc(&sctx->workers_pending);
286}
287
288/* used for workers that require transaction commits */
289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290{
291 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
292
293 /*
 294 * see scrub_pending_trans_workers_inc() for why we're pretending
295 * to be paused in the scrub counters
296 */
297 mutex_lock(&fs_info->scrub_lock);
298 atomic_dec(&fs_info->scrubs_running);
299 atomic_dec(&fs_info->scrubs_paused);
300 mutex_unlock(&fs_info->scrub_lock);
301 atomic_dec(&sctx->workers_pending);
302 wake_up(&fs_info->scrub_pause_wait);
303 wake_up(&sctx->list_wait);
304}
305
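
These four helpers centralize bookkeeping that scrub_fixup_nodatasum() previously open-coded (see the deleted lines further down): bios_in_flight counts plain scrub I/O, while workers_pending covers workers that may commit transactions; the comment above explains why such workers must also be counted as both running and paused, so that a concurrent pause request cannot deadlock against their transaction context.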
306static void scrub_free_csums(struct scrub_ctx *sctx)
307{
308 while (!list_empty(&sctx->csum_list)) {
179 struct btrfs_ordered_sum *sum; 309 struct btrfs_ordered_sum *sum;
180 sum = list_first_entry(&sdev->csum_list, 310 sum = list_first_entry(&sctx->csum_list,
181 struct btrfs_ordered_sum, list); 311 struct btrfs_ordered_sum, list);
182 list_del(&sum->list); 312 list_del(&sum->list);
183 kfree(sum); 313 kfree(sum);
184 } 314 }
185} 315}
186 316
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
188{ 318{
189 int i; 319 int i;
190 320
191 if (!sdev) 321 if (!sctx)
192 return; 322 return;
193 323
324 scrub_free_wr_ctx(&sctx->wr_ctx);
325
194 /* this can happen when scrub is cancelled */ 326 /* this can happen when scrub is cancelled */
195 if (sdev->curr != -1) { 327 if (sctx->curr != -1) {
196 struct scrub_bio *sbio = sdev->bios[sdev->curr]; 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
197 329
198 for (i = 0; i < sbio->page_count; i++) { 330 for (i = 0; i < sbio->page_count; i++) {
199 BUG_ON(!sbio->pagev[i]); 331 WARN_ON(!sbio->pagev[i]->page);
200 BUG_ON(!sbio->pagev[i]->page);
201 scrub_block_put(sbio->pagev[i]->sblock); 332 scrub_block_put(sbio->pagev[i]->sblock);
202 } 333 }
203 bio_put(sbio->bio); 334 bio_put(sbio->bio);
204 } 335 }
205 336
206 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 337 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
207 struct scrub_bio *sbio = sdev->bios[i]; 338 struct scrub_bio *sbio = sctx->bios[i];
208 339
209 if (!sbio) 340 if (!sbio)
210 break; 341 break;
211 kfree(sbio); 342 kfree(sbio);
212 } 343 }
213 344
214 scrub_free_csums(sdev); 345 scrub_free_csums(sctx);
215 kfree(sdev); 346 kfree(sctx);
216} 347}
217 348
218static noinline_for_stack 349static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
220{ 351{
221 struct scrub_dev *sdev; 352 struct scrub_ctx *sctx;
222 int i; 353 int i;
223 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224 int pages_per_bio; 355 int pages_per_rd_bio;
356 int ret;
225 357
226 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 358 /*
227 bio_get_nr_vecs(dev->bdev)); 359 * the setting of pages_per_rd_bio is correct for scrub but might
228 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 360 * be wrong for the dev_replace code where we might read from
229 if (!sdev) 361 * different devices in the initial huge bios. However, that
362 * code is able to correctly handle the case when adding a page
363 * to a bio fails.
364 */
365 if (dev->bdev)
366 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367 bio_get_nr_vecs(dev->bdev));
368 else
369 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371 if (!sctx)
230 goto nomem; 372 goto nomem;
231 sdev->dev = dev; 373 sctx->is_dev_replace = is_dev_replace;
232 sdev->pages_per_bio = pages_per_bio; 374 sctx->pages_per_rd_bio = pages_per_rd_bio;
233 sdev->curr = -1; 375 sctx->curr = -1;
234 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 376 sctx->dev_root = dev->dev_root;
377 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
235 struct scrub_bio *sbio; 378 struct scrub_bio *sbio;
236 379
237 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238 if (!sbio) 381 if (!sbio)
239 goto nomem; 382 goto nomem;
240 sdev->bios[i] = sbio; 383 sctx->bios[i] = sbio;
241 384
242 sbio->index = i; 385 sbio->index = i;
243 sbio->sdev = sdev; 386 sbio->sctx = sctx;
244 sbio->page_count = 0; 387 sbio->page_count = 0;
245 sbio->work.func = scrub_bio_end_io_worker; 388 sbio->work.func = scrub_bio_end_io_worker;
246 389
247 if (i != SCRUB_BIOS_PER_DEV-1) 390 if (i != SCRUB_BIOS_PER_SCTX - 1)
248 sdev->bios[i]->next_free = i + 1; 391 sctx->bios[i]->next_free = i + 1;
249 else 392 else
250 sdev->bios[i]->next_free = -1; 393 sctx->bios[i]->next_free = -1;
251 } 394 }
252 sdev->first_free = 0; 395 sctx->first_free = 0;
253 sdev->nodesize = dev->dev_root->nodesize; 396 sctx->nodesize = dev->dev_root->nodesize;
254 sdev->leafsize = dev->dev_root->leafsize; 397 sctx->leafsize = dev->dev_root->leafsize;
255 sdev->sectorsize = dev->dev_root->sectorsize; 398 sctx->sectorsize = dev->dev_root->sectorsize;
256 atomic_set(&sdev->in_flight, 0); 399 atomic_set(&sctx->bios_in_flight, 0);
257 atomic_set(&sdev->fixup_cnt, 0); 400 atomic_set(&sctx->workers_pending, 0);
258 atomic_set(&sdev->cancel_req, 0); 401 atomic_set(&sctx->cancel_req, 0);
259 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 402 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260 INIT_LIST_HEAD(&sdev->csum_list); 403 INIT_LIST_HEAD(&sctx->csum_list);
261 404
262 spin_lock_init(&sdev->list_lock); 405 spin_lock_init(&sctx->list_lock);
263 spin_lock_init(&sdev->stat_lock); 406 spin_lock_init(&sctx->stat_lock);
264 init_waitqueue_head(&sdev->list_wait); 407 init_waitqueue_head(&sctx->list_wait);
265 return sdev; 408
409 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410 fs_info->dev_replace.tgtdev, is_dev_replace);
411 if (ret) {
412 scrub_free_ctx(sctx);
413 return ERR_PTR(ret);
414 }
415 return sctx;
266 416
267nomem: 417nomem:
268 scrub_free_dev(sdev); 418 scrub_free_ctx(sctx);
269 return ERR_PTR(-ENOMEM); 419 return ERR_PTR(-ENOMEM);
270} 420}
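
Note that scrub_setup_ctx() can now fail for a reason other than allocation: if scrub_setup_wr_ctx() fails, its error is passed through as an ERR_PTR. A caller handles both cases the same way:

	sctx = scrub_setup_ctx(dev, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);	/* -ENOMEM or wr_ctx setup error */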
271 421
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423 void *warn_ctx)
273{ 424{
274 u64 isize; 425 u64 isize;
275 u32 nlink; 426 u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
277 int i; 428 int i;
278 struct extent_buffer *eb; 429 struct extent_buffer *eb;
279 struct btrfs_inode_item *inode_item; 430 struct btrfs_inode_item *inode_item;
280 struct scrub_warning *swarn = ctx; 431 struct scrub_warning *swarn = warn_ctx;
281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282 struct inode_fs_paths *ipath = NULL; 433 struct inode_fs_paths *ipath = NULL;
283 struct btrfs_root *local_root; 434 struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
345 496
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{ 498{
348 struct btrfs_device *dev = sblock->sdev->dev; 499 struct btrfs_device *dev;
349 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 500 struct btrfs_fs_info *fs_info;
350 struct btrfs_path *path; 501 struct btrfs_path *path;
351 struct btrfs_key found_key; 502 struct btrfs_key found_key;
352 struct extent_buffer *eb; 503 struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
361 const int bufsize = 4096; 512 const int bufsize = 4096;
362 int ret; 513 int ret;
363 514
515 WARN_ON(sblock->page_count < 1);
516 dev = sblock->pagev[0]->dev;
517 fs_info = sblock->sctx->dev_root->fs_info;
518
364 path = btrfs_alloc_path(); 519 path = btrfs_alloc_path();
365 520
366 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 521 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
367 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 522 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
368 BUG_ON(sblock->page_count < 1); 523 swarn.sector = (sblock->pagev[0]->physical) >> 9;
369 swarn.sector = (sblock->pagev[0].physical) >> 9; 524 swarn.logical = sblock->pagev[0]->logical;
370 swarn.logical = sblock->pagev[0].logical;
371 swarn.errstr = errstr; 525 swarn.errstr = errstr;
372 swarn.dev = dev; 526 swarn.dev = NULL;
373 swarn.msg_bufsize = bufsize; 527 swarn.msg_bufsize = bufsize;
374 swarn.scratch_bufsize = bufsize; 528 swarn.scratch_bufsize = bufsize;
375 529
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
405 } while (ret != 1); 559 } while (ret != 1);
406 } else { 560 } else {
407 swarn.path = path; 561 swarn.path = path;
562 swarn.dev = dev;
408 iterate_extent_inodes(fs_info, found_key.objectid, 563 iterate_extent_inodes(fs_info, found_key.objectid,
409 extent_item_pos, 1, 564 extent_item_pos, 1,
410 scrub_print_warning_inode, &swarn); 565 scrub_print_warning_inode, &swarn);
@@ -416,29 +571,38 @@ out:
416 kfree(swarn.msg_buf); 571 kfree(swarn.msg_buf);
417} 572}
418 573
419static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
420{ 575{
421 struct page *page = NULL; 576 struct page *page = NULL;
422 unsigned long index; 577 unsigned long index;
423 struct scrub_fixup_nodatasum *fixup = ctx; 578 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
424 int ret; 579 int ret;
425 int corrected = 0; 580 int corrected = 0;
426 struct btrfs_key key; 581 struct btrfs_key key;
427 struct inode *inode = NULL; 582 struct inode *inode = NULL;
583 struct btrfs_fs_info *fs_info;
428 u64 end = offset + PAGE_SIZE - 1; 584 u64 end = offset + PAGE_SIZE - 1;
429 struct btrfs_root *local_root; 585 struct btrfs_root *local_root;
586 int srcu_index;
430 587
431 key.objectid = root; 588 key.objectid = root;
432 key.type = BTRFS_ROOT_ITEM_KEY; 589 key.type = BTRFS_ROOT_ITEM_KEY;
433 key.offset = (u64)-1; 590 key.offset = (u64)-1;
434 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); 591
435 if (IS_ERR(local_root)) 592 fs_info = fixup->root->fs_info;
593 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
594
595 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
596 if (IS_ERR(local_root)) {
597 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
436 return PTR_ERR(local_root); 598 return PTR_ERR(local_root);
599 }
437 600
438 key.type = BTRFS_INODE_ITEM_KEY; 601 key.type = BTRFS_INODE_ITEM_KEY;
439 key.objectid = inum; 602 key.objectid = inum;
440 key.offset = 0; 603 key.offset = 0;
441 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); 604 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
605 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
442 if (IS_ERR(inode)) 606 if (IS_ERR(inode))
443 return PTR_ERR(inode); 607 return PTR_ERR(inode);
444 608
@@ -451,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
451 } 615 }
452 616
453 if (PageUptodate(page)) { 617 if (PageUptodate(page)) {
454 struct btrfs_mapping_tree *map_tree;
455 if (PageDirty(page)) { 618 if (PageDirty(page)) {
456 /* 619 /*
457 * we need to write the data to the defect sector. the 620 * we need to write the data to the defect sector. the
@@ -472,8 +635,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
472 ret = -EIO; 635 ret = -EIO;
473 goto out; 636 goto out;
474 } 637 }
475 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 638 fs_info = BTRFS_I(inode)->root->fs_info;
476 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 639 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
477 fixup->logical, page, 640 fixup->logical, page,
478 fixup->mirror_num); 641 fixup->mirror_num);
479 unlock_page(page); 642 unlock_page(page);
@@ -530,21 +693,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
530{ 693{
531 int ret; 694 int ret;
532 struct scrub_fixup_nodatasum *fixup; 695 struct scrub_fixup_nodatasum *fixup;
533 struct scrub_dev *sdev; 696 struct scrub_ctx *sctx;
534 struct btrfs_trans_handle *trans = NULL; 697 struct btrfs_trans_handle *trans = NULL;
535 struct btrfs_fs_info *fs_info; 698 struct btrfs_fs_info *fs_info;
536 struct btrfs_path *path; 699 struct btrfs_path *path;
537 int uncorrectable = 0; 700 int uncorrectable = 0;
538 701
539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 702 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
540 sdev = fixup->sdev; 703 sctx = fixup->sctx;
541 fs_info = fixup->root->fs_info; 704 fs_info = fixup->root->fs_info;
542 705
543 path = btrfs_alloc_path(); 706 path = btrfs_alloc_path();
544 if (!path) { 707 if (!path) {
545 spin_lock(&sdev->stat_lock); 708 spin_lock(&sctx->stat_lock);
546 ++sdev->stat.malloc_errors; 709 ++sctx->stat.malloc_errors;
547 spin_unlock(&sdev->stat_lock); 710 spin_unlock(&sctx->stat_lock);
548 uncorrectable = 1; 711 uncorrectable = 1;
549 goto out; 712 goto out;
550 } 713 }
@@ -573,35 +736,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
573 } 736 }
574 WARN_ON(ret != 1); 737 WARN_ON(ret != 1);
575 738
576 spin_lock(&sdev->stat_lock); 739 spin_lock(&sctx->stat_lock);
577 ++sdev->stat.corrected_errors; 740 ++sctx->stat.corrected_errors;
578 spin_unlock(&sdev->stat_lock); 741 spin_unlock(&sctx->stat_lock);
579 742
580out: 743out:
581 if (trans && !IS_ERR(trans)) 744 if (trans && !IS_ERR(trans))
582 btrfs_end_transaction(trans, fixup->root); 745 btrfs_end_transaction(trans, fixup->root);
583 if (uncorrectable) { 746 if (uncorrectable) {
584 spin_lock(&sdev->stat_lock); 747 spin_lock(&sctx->stat_lock);
585 ++sdev->stat.uncorrectable_errors; 748 ++sctx->stat.uncorrectable_errors;
586 spin_unlock(&sdev->stat_lock); 749 spin_unlock(&sctx->stat_lock);
587 750 btrfs_dev_replace_stats_inc(
751 &sctx->dev_root->fs_info->dev_replace.
752 num_uncorrectable_read_errors);
588 printk_ratelimited_in_rcu(KERN_ERR 753 printk_ratelimited_in_rcu(KERN_ERR
589 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 754 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
590 (unsigned long long)fixup->logical, 755 (unsigned long long)fixup->logical,
591 rcu_str_deref(sdev->dev->name)); 756 rcu_str_deref(fixup->dev->name));
592 } 757 }
593 758
594 btrfs_free_path(path); 759 btrfs_free_path(path);
595 kfree(fixup); 760 kfree(fixup);
596 761
597 /* see caller why we're pretending to be paused in the scrub counters */ 762 scrub_pending_trans_workers_dec(sctx);
598 mutex_lock(&fs_info->scrub_lock);
599 atomic_dec(&fs_info->scrubs_running);
600 atomic_dec(&fs_info->scrubs_paused);
601 mutex_unlock(&fs_info->scrub_lock);
602 atomic_dec(&sdev->fixup_cnt);
603 wake_up(&fs_info->scrub_pause_wait);
604 wake_up(&sdev->list_wait);
605} 763}
606 764
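
scrub_pending_trans_workers_inc()/_dec() replace the open-coded scrubs_running/scrubs_paused/fixup_cnt juggling that the removed lines performed. The helpers themselves are not shown in this hunk; a user-space sketch of the counter they presumably encapsulate (increment before queueing a fixup worker, decrement when it finishes, wait for zero before teardown):

    #include <pthread.h>

    struct pending {
            pthread_mutex_t lock;
            pthread_cond_t  done;
            int             count;
    };

    static struct pending scrub_pending = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .done = PTHREAD_COND_INITIALIZER,
    };

    static void pending_inc(struct pending *p)
    {
            pthread_mutex_lock(&p->lock);
            p->count++;
            pthread_mutex_unlock(&p->lock);
    }

    static void pending_dec(struct pending *p)
    {
            pthread_mutex_lock(&p->lock);
            if (--p->count == 0)
                    pthread_cond_broadcast(&p->done); /* wake waiters at zero */
            pthread_mutex_unlock(&p->lock);
    }

    static void pending_wait(struct pending *p)
    {
            pthread_mutex_lock(&p->lock);
            while (p->count > 0)
                    pthread_cond_wait(&p->done, &p->lock);
            pthread_mutex_unlock(&p->lock);
    }
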
607/* 765/*
@@ -614,7 +772,8 @@ out:
614 */ 772 */
615static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 773static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
616{ 774{
617 struct scrub_dev *sdev = sblock_to_check->sdev; 775 struct scrub_ctx *sctx = sblock_to_check->sctx;
776 struct btrfs_device *dev;
618 struct btrfs_fs_info *fs_info; 777 struct btrfs_fs_info *fs_info;
619 u64 length; 778 u64 length;
620 u64 logical; 779 u64 logical;
@@ -633,16 +792,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
633 DEFAULT_RATELIMIT_BURST); 792 DEFAULT_RATELIMIT_BURST);
634 793
635 BUG_ON(sblock_to_check->page_count < 1); 794 BUG_ON(sblock_to_check->page_count < 1);
636 fs_info = sdev->dev->dev_root->fs_info; 795 fs_info = sctx->dev_root->fs_info;
796 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
797 /*
 798 * if we find an error in a super block, we just report it:
 799 * super blocks get rewritten with the next transaction commit
 800 * anyway.
801 */
802 spin_lock(&sctx->stat_lock);
803 ++sctx->stat.super_errors;
804 spin_unlock(&sctx->stat_lock);
805 return 0;
806 }
637 length = sblock_to_check->page_count * PAGE_SIZE; 807 length = sblock_to_check->page_count * PAGE_SIZE;
638 logical = sblock_to_check->pagev[0].logical; 808 logical = sblock_to_check->pagev[0]->logical;
639 generation = sblock_to_check->pagev[0].generation; 809 generation = sblock_to_check->pagev[0]->generation;
640 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 810 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
641 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 811 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
642 is_metadata = !(sblock_to_check->pagev[0].flags & 812 is_metadata = !(sblock_to_check->pagev[0]->flags &
643 BTRFS_EXTENT_FLAG_DATA); 813 BTRFS_EXTENT_FLAG_DATA);
644 have_csum = sblock_to_check->pagev[0].have_csum; 814 have_csum = sblock_to_check->pagev[0]->have_csum;
645 csum = sblock_to_check->pagev[0].csum; 815 csum = sblock_to_check->pagev[0]->csum;
816 dev = sblock_to_check->pagev[0]->dev;
817
818 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
819 sblocks_for_recheck = NULL;
820 goto nodatasum_case;
821 }
646 822
647 /* 823 /*
648 * read all mirrors one after the other. This includes to 824 * read all mirrors one after the other. This includes to
@@ -677,43 +853,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
677 sizeof(*sblocks_for_recheck), 853 sizeof(*sblocks_for_recheck),
678 GFP_NOFS); 854 GFP_NOFS);
679 if (!sblocks_for_recheck) { 855 if (!sblocks_for_recheck) {
680 spin_lock(&sdev->stat_lock); 856 spin_lock(&sctx->stat_lock);
681 sdev->stat.malloc_errors++; 857 sctx->stat.malloc_errors++;
682 sdev->stat.read_errors++; 858 sctx->stat.read_errors++;
683 sdev->stat.uncorrectable_errors++; 859 sctx->stat.uncorrectable_errors++;
684 spin_unlock(&sdev->stat_lock); 860 spin_unlock(&sctx->stat_lock);
685 btrfs_dev_stat_inc_and_print(sdev->dev, 861 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
686 BTRFS_DEV_STAT_READ_ERRS);
687 goto out; 862 goto out;
688 } 863 }
689 864
690 /* setup the context, map the logical blocks and alloc the pages */ 865 /* setup the context, map the logical blocks and alloc the pages */
691 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 866 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
692 logical, sblocks_for_recheck); 867 logical, sblocks_for_recheck);
693 if (ret) { 868 if (ret) {
694 spin_lock(&sdev->stat_lock); 869 spin_lock(&sctx->stat_lock);
695 sdev->stat.read_errors++; 870 sctx->stat.read_errors++;
696 sdev->stat.uncorrectable_errors++; 871 sctx->stat.uncorrectable_errors++;
697 spin_unlock(&sdev->stat_lock); 872 spin_unlock(&sctx->stat_lock);
698 btrfs_dev_stat_inc_and_print(sdev->dev, 873 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
699 BTRFS_DEV_STAT_READ_ERRS);
700 goto out; 874 goto out;
701 } 875 }
702 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 876 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
703 sblock_bad = sblocks_for_recheck + failed_mirror_index; 877 sblock_bad = sblocks_for_recheck + failed_mirror_index;
704 878
705 /* build and submit the bios for the failed mirror, check checksums */ 879 /* build and submit the bios for the failed mirror, check checksums */
706 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 880 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
707 csum, generation, sdev->csum_size); 881 csum, generation, sctx->csum_size);
708 if (ret) {
709 spin_lock(&sdev->stat_lock);
710 sdev->stat.read_errors++;
711 sdev->stat.uncorrectable_errors++;
712 spin_unlock(&sdev->stat_lock);
713 btrfs_dev_stat_inc_and_print(sdev->dev,
714 BTRFS_DEV_STAT_READ_ERRS);
715 goto out;
716 }
717 882
718 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 883 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
719 sblock_bad->no_io_error_seen) { 884 sblock_bad->no_io_error_seen) {
@@ -725,50 +890,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 * different bio (usually one of the two latter cases is 890 * different bio (usually one of the two latter cases is
726 * the cause) 891 * the cause)
727 */ 892 */
728 spin_lock(&sdev->stat_lock); 893 spin_lock(&sctx->stat_lock);
729 sdev->stat.unverified_errors++; 894 sctx->stat.unverified_errors++;
730 spin_unlock(&sdev->stat_lock); 895 spin_unlock(&sctx->stat_lock);
731 896
897 if (sctx->is_dev_replace)
898 scrub_write_block_to_dev_replace(sblock_bad);
732 goto out; 899 goto out;
733 } 900 }
734 901
735 if (!sblock_bad->no_io_error_seen) { 902 if (!sblock_bad->no_io_error_seen) {
736 spin_lock(&sdev->stat_lock); 903 spin_lock(&sctx->stat_lock);
737 sdev->stat.read_errors++; 904 sctx->stat.read_errors++;
738 spin_unlock(&sdev->stat_lock); 905 spin_unlock(&sctx->stat_lock);
739 if (__ratelimit(&_rs)) 906 if (__ratelimit(&_rs))
740 scrub_print_warning("i/o error", sblock_to_check); 907 scrub_print_warning("i/o error", sblock_to_check);
741 btrfs_dev_stat_inc_and_print(sdev->dev, 908 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
742 BTRFS_DEV_STAT_READ_ERRS);
743 } else if (sblock_bad->checksum_error) { 909 } else if (sblock_bad->checksum_error) {
744 spin_lock(&sdev->stat_lock); 910 spin_lock(&sctx->stat_lock);
745 sdev->stat.csum_errors++; 911 sctx->stat.csum_errors++;
746 spin_unlock(&sdev->stat_lock); 912 spin_unlock(&sctx->stat_lock);
747 if (__ratelimit(&_rs)) 913 if (__ratelimit(&_rs))
748 scrub_print_warning("checksum error", sblock_to_check); 914 scrub_print_warning("checksum error", sblock_to_check);
749 btrfs_dev_stat_inc_and_print(sdev->dev, 915 btrfs_dev_stat_inc_and_print(dev,
750 BTRFS_DEV_STAT_CORRUPTION_ERRS); 916 BTRFS_DEV_STAT_CORRUPTION_ERRS);
751 } else if (sblock_bad->header_error) { 917 } else if (sblock_bad->header_error) {
752 spin_lock(&sdev->stat_lock); 918 spin_lock(&sctx->stat_lock);
753 sdev->stat.verify_errors++; 919 sctx->stat.verify_errors++;
754 spin_unlock(&sdev->stat_lock); 920 spin_unlock(&sctx->stat_lock);
755 if (__ratelimit(&_rs)) 921 if (__ratelimit(&_rs))
756 scrub_print_warning("checksum/header error", 922 scrub_print_warning("checksum/header error",
757 sblock_to_check); 923 sblock_to_check);
758 if (sblock_bad->generation_error) 924 if (sblock_bad->generation_error)
759 btrfs_dev_stat_inc_and_print(sdev->dev, 925 btrfs_dev_stat_inc_and_print(dev,
760 BTRFS_DEV_STAT_GENERATION_ERRS); 926 BTRFS_DEV_STAT_GENERATION_ERRS);
761 else 927 else
762 btrfs_dev_stat_inc_and_print(sdev->dev, 928 btrfs_dev_stat_inc_and_print(dev,
763 BTRFS_DEV_STAT_CORRUPTION_ERRS); 929 BTRFS_DEV_STAT_CORRUPTION_ERRS);
764 } 930 }
765 931
766 if (sdev->readonly) 932 if (sctx->readonly && !sctx->is_dev_replace)
767 goto did_not_correct_error; 933 goto did_not_correct_error;
768 934
769 if (!is_metadata && !have_csum) { 935 if (!is_metadata && !have_csum) {
770 struct scrub_fixup_nodatasum *fixup_nodatasum; 936 struct scrub_fixup_nodatasum *fixup_nodatasum;
771 937
938nodatasum_case:
939 WARN_ON(sctx->is_dev_replace);
940
772 /* 941 /*
773 * !is_metadata and !have_csum, this means that the data 942 * !is_metadata and !have_csum, this means that the data
774 * might not be COW'ed, that it might be modified 943 * might not be COW'ed, that it might be modified
@@ -779,24 +948,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
779 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); 948 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
780 if (!fixup_nodatasum) 949 if (!fixup_nodatasum)
781 goto did_not_correct_error; 950 goto did_not_correct_error;
782 fixup_nodatasum->sdev = sdev; 951 fixup_nodatasum->sctx = sctx;
952 fixup_nodatasum->dev = dev;
783 fixup_nodatasum->logical = logical; 953 fixup_nodatasum->logical = logical;
784 fixup_nodatasum->root = fs_info->extent_root; 954 fixup_nodatasum->root = fs_info->extent_root;
785 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 955 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
786 /* 956 scrub_pending_trans_workers_inc(sctx);
787 * increment scrubs_running to prevent cancel requests from
788 * completing as long as a fixup worker is running. we must also
789 * increment scrubs_paused to prevent deadlocking on pause
790 * requests used for transactions commits (as the worker uses a
791 * transaction context). it is safe to regard the fixup worker
792 * as paused for all matters practical. effectively, we only
793 * avoid cancellation requests from completing.
794 */
795 mutex_lock(&fs_info->scrub_lock);
796 atomic_inc(&fs_info->scrubs_running);
797 atomic_inc(&fs_info->scrubs_paused);
798 mutex_unlock(&fs_info->scrub_lock);
799 atomic_inc(&sdev->fixup_cnt);
800 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 957 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
801 btrfs_queue_worker(&fs_info->scrub_workers, 958 btrfs_queue_worker(&fs_info->scrub_workers,
802 &fixup_nodatasum->work); 959 &fixup_nodatasum->work);
@@ -805,26 +962,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 962
806 /* 963 /*
807 * now build and submit the bios for the other mirrors, check 964 * now build and submit the bios for the other mirrors, check
808 * checksums 965 * checksums.
809 */ 966 * First try to pick the mirror which is completely without I/O
810 for (mirror_index = 0;
811 mirror_index < BTRFS_MAX_MIRRORS &&
812 sblocks_for_recheck[mirror_index].page_count > 0;
813 mirror_index++) {
814 if (mirror_index == failed_mirror_index)
815 continue;
816
817 /* build and submit the bios, check checksums */
818 ret = scrub_recheck_block(fs_info,
819 sblocks_for_recheck + mirror_index,
820 is_metadata, have_csum, csum,
821 generation, sdev->csum_size);
822 if (ret)
823 goto did_not_correct_error;
824 }
825
826 /*
827 * first try to pick the mirror which is completely without I/O
828 * errors and also does not have a checksum error. 967 * errors and also does not have a checksum error.
829 * If one is found, and if a checksum is present, the full block 968 * If one is found, and if a checksum is present, the full block
830 * that is known to contain an error is rewritten. Afterwards 969 * that is known to contain an error is rewritten. Afterwards
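
The comment above describes a two-stage strategy: prefer a mirror that is clean as a whole, and fall back to page-by-page repair only when no such mirror exists. A sketch of the selection stage with illustrative types (the real loop also issues the rechecks via scrub_recheck_block()):

    struct mirror {
            int header_error;
            int checksum_error;
            int io_error_seen;
            int page_count;         /* 0 terminates the mirror list */
    };

    static int pick_clean_mirror(const struct mirror *m, int max, int failed)
    {
            int i;

            for (i = 0; i < max && m[i].page_count > 0; i++) {
                    if (i == failed)
                            continue;   /* skip the known-bad mirror */
                    if (!m[i].header_error && !m[i].checksum_error &&
                        !m[i].io_error_seen)
                            return i;   /* copy the whole block from here */
            }
            return -1;                  /* fall back to per-page repair */
    }
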
@@ -840,24 +979,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
840 mirror_index < BTRFS_MAX_MIRRORS && 979 mirror_index < BTRFS_MAX_MIRRORS &&
841 sblocks_for_recheck[mirror_index].page_count > 0; 980 sblocks_for_recheck[mirror_index].page_count > 0;
842 mirror_index++) { 981 mirror_index++) {
843 struct scrub_block *sblock_other = sblocks_for_recheck + 982 struct scrub_block *sblock_other;
844 mirror_index; 983
984 if (mirror_index == failed_mirror_index)
985 continue;
986 sblock_other = sblocks_for_recheck + mirror_index;
987
988 /* build and submit the bios, check checksums */
989 scrub_recheck_block(fs_info, sblock_other, is_metadata,
990 have_csum, csum, generation,
991 sctx->csum_size);
845 992
846 if (!sblock_other->header_error && 993 if (!sblock_other->header_error &&
847 !sblock_other->checksum_error && 994 !sblock_other->checksum_error &&
848 sblock_other->no_io_error_seen) { 995 sblock_other->no_io_error_seen) {
849 int force_write = is_metadata || have_csum; 996 if (sctx->is_dev_replace) {
850 997 scrub_write_block_to_dev_replace(sblock_other);
851 ret = scrub_repair_block_from_good_copy(sblock_bad, 998 } else {
852 sblock_other, 999 int force_write = is_metadata || have_csum;
853 force_write); 1000
1001 ret = scrub_repair_block_from_good_copy(
1002 sblock_bad, sblock_other,
1003 force_write);
1004 }
854 if (0 == ret) 1005 if (0 == ret)
855 goto corrected_error; 1006 goto corrected_error;
856 } 1007 }
857 } 1008 }
858 1009
859 /* 1010 /*
860 * in case of I/O errors in the area that is supposed to be 1011 * for dev_replace, pick good pages and write to the target device.
1012 */
1013 if (sctx->is_dev_replace) {
1014 success = 1;
1015 for (page_num = 0; page_num < sblock_bad->page_count;
1016 page_num++) {
1017 int sub_success;
1018
1019 sub_success = 0;
1020 for (mirror_index = 0;
1021 mirror_index < BTRFS_MAX_MIRRORS &&
1022 sblocks_for_recheck[mirror_index].page_count > 0;
1023 mirror_index++) {
1024 struct scrub_block *sblock_other =
1025 sblocks_for_recheck + mirror_index;
1026 struct scrub_page *page_other =
1027 sblock_other->pagev[page_num];
1028
1029 if (!page_other->io_error) {
1030 ret = scrub_write_page_to_dev_replace(
1031 sblock_other, page_num);
1032 if (ret == 0) {
1033 /* succeeded for this page */
1034 sub_success = 1;
1035 break;
1036 } else {
1037 btrfs_dev_replace_stats_inc(
1038 &sctx->dev_root->
1039 fs_info->dev_replace.
1040 num_write_errors);
1041 }
1042 }
1043 }
1044
1045 if (!sub_success) {
1046 /*
1047 * did not find a mirror to fetch the page
1048 * from. scrub_write_page_to_dev_replace()
1049 * handles this case (page->io_error) by
1050 * filling the block with zeros before
1051 * submitting the write request.
1052 */
1053 success = 0;
1054 ret = scrub_write_page_to_dev_replace(
1055 sblock_bad, page_num);
1056 if (ret)
1057 btrfs_dev_replace_stats_inc(
1058 &sctx->dev_root->fs_info->
1059 dev_replace.num_write_errors);
1060 }
1061 }
1062
1063 goto out;
1064 }
1065
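
A condensed view of the per-page dev-replace fallback just above: each page of the bad block takes the first mirror copy that read cleanly, and when every mirror failed, a zero-filled page is written instead (the real zero fill lives in scrub_write_page_to_dev_replace(), keyed off page->io_error). Illustrative, self-contained C:

    #include <string.h>

    #define PAGE_SZ 4096

    struct page_copy {
            int io_error;
            unsigned char data[PAGE_SZ];
    };

    /* returns 0 if a good copy was found, -1 if zeros had to be used */
    static int salvage_page(const struct page_copy *mirrors, int nmirrors,
                            unsigned char *out)
    {
            int m;

            for (m = 0; m < nmirrors; m++) {
                    if (!mirrors[m].io_error) {
                            memcpy(out, mirrors[m].data, PAGE_SZ);
                            return 0;
                    }
            }
            memset(out, 0, PAGE_SZ);    /* no readable copy on any mirror */
            return -1;
    }
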
1066 /*
1067 * for regular scrub, repair those pages that are errored.
1068 * In case of I/O errors in the area that is supposed to be
861 * repaired, continue by picking good copies of those pages. 1069 * repaired, continue by picking good copies of those pages.
862 * Select the good pages from mirrors to rewrite bad pages from 1070 * Select the good pages from mirrors to rewrite bad pages from
863 * the area to fix. Afterwards verify the checksum of the block 1071 * the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1095,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
887 1095
888 success = 1; 1096 success = 1;
889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1097 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
890 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1098 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
891 1099
892 if (!page_bad->io_error) 1100 if (!page_bad->io_error)
893 continue; 1101 continue;
@@ -898,8 +1106,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
898 mirror_index++) { 1106 mirror_index++) {
899 struct scrub_block *sblock_other = sblocks_for_recheck + 1107 struct scrub_block *sblock_other = sblocks_for_recheck +
900 mirror_index; 1108 mirror_index;
901 struct scrub_page *page_other = sblock_other->pagev + 1109 struct scrub_page *page_other = sblock_other->pagev[
902 page_num; 1110 page_num];
903 1111
904 if (!page_other->io_error) { 1112 if (!page_other->io_error) {
905 ret = scrub_repair_page_from_good_copy( 1113 ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1136,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
928 * is verified, but most likely the data comes out 1136 * is verified, but most likely the data comes out
929 * of the page cache. 1137 * of the page cache.
930 */ 1138 */
931 ret = scrub_recheck_block(fs_info, sblock_bad, 1139 scrub_recheck_block(fs_info, sblock_bad,
932 is_metadata, have_csum, csum, 1140 is_metadata, have_csum, csum,
933 generation, sdev->csum_size); 1141 generation, sctx->csum_size);
934 if (!ret && !sblock_bad->header_error && 1142 if (!sblock_bad->header_error &&
935 !sblock_bad->checksum_error && 1143 !sblock_bad->checksum_error &&
936 sblock_bad->no_io_error_seen) 1144 sblock_bad->no_io_error_seen)
937 goto corrected_error; 1145 goto corrected_error;
@@ -939,23 +1147,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
939 goto did_not_correct_error; 1147 goto did_not_correct_error;
940 } else { 1148 } else {
941corrected_error: 1149corrected_error:
942 spin_lock(&sdev->stat_lock); 1150 spin_lock(&sctx->stat_lock);
943 sdev->stat.corrected_errors++; 1151 sctx->stat.corrected_errors++;
944 spin_unlock(&sdev->stat_lock); 1152 spin_unlock(&sctx->stat_lock);
945 printk_ratelimited_in_rcu(KERN_ERR 1153 printk_ratelimited_in_rcu(KERN_ERR
946 "btrfs: fixed up error at logical %llu on dev %s\n", 1154 "btrfs: fixed up error at logical %llu on dev %s\n",
947 (unsigned long long)logical, 1155 (unsigned long long)logical,
948 rcu_str_deref(sdev->dev->name)); 1156 rcu_str_deref(dev->name));
949 } 1157 }
950 } else { 1158 } else {
951did_not_correct_error: 1159did_not_correct_error:
952 spin_lock(&sdev->stat_lock); 1160 spin_lock(&sctx->stat_lock);
953 sdev->stat.uncorrectable_errors++; 1161 sctx->stat.uncorrectable_errors++;
954 spin_unlock(&sdev->stat_lock); 1162 spin_unlock(&sctx->stat_lock);
955 printk_ratelimited_in_rcu(KERN_ERR 1163 printk_ratelimited_in_rcu(KERN_ERR
956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1164 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
957 (unsigned long long)logical, 1165 (unsigned long long)logical,
958 rcu_str_deref(sdev->dev->name)); 1166 rcu_str_deref(dev->name));
959 } 1167 }
960 1168
961out: 1169out:
@@ -966,11 +1174,11 @@ out:
966 mirror_index; 1174 mirror_index;
967 int page_index; 1175 int page_index;
968 1176
969 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1177 for (page_index = 0; page_index < sblock->page_count;
970 page_index++) 1178 page_index++) {
971 if (sblock->pagev[page_index].page) 1179 sblock->pagev[page_index]->sblock = NULL;
972 __free_page( 1180 scrub_page_put(sblock->pagev[page_index]);
973 sblock->pagev[page_index].page); 1181 }
974 } 1182 }
975 kfree(sblocks_for_recheck); 1183 kfree(sblocks_for_recheck);
976 } 1184 }
@@ -978,8 +1186,9 @@ out:
978 return 0; 1186 return 0;
979} 1187}
980 1188
981static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1189static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
982 struct btrfs_mapping_tree *map_tree, 1190 struct btrfs_fs_info *fs_info,
1191 struct scrub_block *original_sblock,
983 u64 length, u64 logical, 1192 u64 length, u64 logical,
984 struct scrub_block *sblocks_for_recheck) 1193 struct scrub_block *sblocks_for_recheck)
985{ 1194{
@@ -988,7 +1197,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
988 int ret; 1197 int ret;
989 1198
990 /* 1199 /*
991 * note: the three members sdev, ref_count and outstanding_pages 1200 * note: the two members ref_count and outstanding_pages
992 * are not used (and not set) in the blocks that are used for 1201 * are not used (and not set) in the blocks that are used for
993 * the recheck procedure 1202 * the recheck procedure
994 */ 1203 */
@@ -1003,14 +1212,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1003 * with a length of PAGE_SIZE, each returned stripe 1212 * with a length of PAGE_SIZE, each returned stripe
1004 * represents one mirror 1213 * represents one mirror
1005 */ 1214 */
1006 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1215 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1007 &bbio, 0); 1216 &mapped_length, &bbio, 0);
1008 if (ret || !bbio || mapped_length < sublen) { 1217 if (ret || !bbio || mapped_length < sublen) {
1009 kfree(bbio); 1218 kfree(bbio);
1010 return -EIO; 1219 return -EIO;
1011 } 1220 }
1012 1221
1013 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1222 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1223 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1015 mirror_index++) { 1224 mirror_index++) {
1016 struct scrub_block *sblock; 1225 struct scrub_block *sblock;
@@ -1020,21 +1229,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1020 continue; 1229 continue;
1021 1230
1022 sblock = sblocks_for_recheck + mirror_index; 1231 sblock = sblocks_for_recheck + mirror_index;
1023 page = sblock->pagev + page_index; 1232 sblock->sctx = sctx;
1233 page = kzalloc(sizeof(*page), GFP_NOFS);
1234 if (!page) {
1235leave_nomem:
1236 spin_lock(&sctx->stat_lock);
1237 sctx->stat.malloc_errors++;
1238 spin_unlock(&sctx->stat_lock);
1239 kfree(bbio);
1240 return -ENOMEM;
1241 }
1242 scrub_page_get(page);
1243 sblock->pagev[page_index] = page;
1024 page->logical = logical; 1244 page->logical = logical;
1025 page->physical = bbio->stripes[mirror_index].physical; 1245 page->physical = bbio->stripes[mirror_index].physical;
1246 BUG_ON(page_index >= original_sblock->page_count);
1247 page->physical_for_dev_replace =
1248 original_sblock->pagev[page_index]->
1249 physical_for_dev_replace;
1026 /* for missing devices, dev->bdev is NULL */ 1250 /* for missing devices, dev->bdev is NULL */
1027 page->dev = bbio->stripes[mirror_index].dev; 1251 page->dev = bbio->stripes[mirror_index].dev;
1028 page->mirror_num = mirror_index + 1; 1252 page->mirror_num = mirror_index + 1;
1029 page->page = alloc_page(GFP_NOFS);
1030 if (!page->page) {
1031 spin_lock(&sdev->stat_lock);
1032 sdev->stat.malloc_errors++;
1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1035 return -ENOMEM;
1036 }
1037 sblock->page_count++; 1253 sblock->page_count++;
1254 page->page = alloc_page(GFP_NOFS);
1255 if (!page->page)
1256 goto leave_nomem;
1038 } 1257 }
1039 kfree(bbio); 1258 kfree(bbio);
1040 length -= sublen; 1259 length -= sublen;
@@ -1052,10 +1271,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1052 * to take those pages that are not errored from all the mirrors so that 1271 * to take those pages that are not errored from all the mirrors so that
1053 * the pages that are errored in the just handled mirror can be repaired. 1272 * the pages that are errored in the just handled mirror can be repaired.
1054 */ 1273 */
1055static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1274static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1056 struct scrub_block *sblock, int is_metadata, 1275 struct scrub_block *sblock, int is_metadata,
1057 int have_csum, u8 *csum, u64 generation, 1276 int have_csum, u8 *csum, u64 generation,
1058 u16 csum_size) 1277 u16 csum_size)
1059{ 1278{
1060 int page_num; 1279 int page_num;
1061 1280
@@ -1065,8 +1284,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1065 1284
1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1285 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1067 struct bio *bio; 1286 struct bio *bio;
1068 int ret; 1287 struct scrub_page *page = sblock->pagev[page_num];
1069 struct scrub_page *page = sblock->pagev + page_num;
1070 DECLARE_COMPLETION_ONSTACK(complete); 1288 DECLARE_COMPLETION_ONSTACK(complete);
1071 1289
1072 if (page->dev->bdev == NULL) { 1290 if (page->dev->bdev == NULL) {
@@ -1075,20 +1293,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1075 continue; 1293 continue;
1076 } 1294 }
1077 1295
1078 BUG_ON(!page->page); 1296 WARN_ON(!page->page);
1079 bio = bio_alloc(GFP_NOFS, 1); 1297 bio = bio_alloc(GFP_NOFS, 1);
1080 if (!bio) 1298 if (!bio) {
1081 return -EIO; 1299 page->io_error = 1;
1300 sblock->no_io_error_seen = 0;
1301 continue;
1302 }
1082 bio->bi_bdev = page->dev->bdev; 1303 bio->bi_bdev = page->dev->bdev;
1083 bio->bi_sector = page->physical >> 9; 1304 bio->bi_sector = page->physical >> 9;
1084 bio->bi_end_io = scrub_complete_bio_end_io; 1305 bio->bi_end_io = scrub_complete_bio_end_io;
1085 bio->bi_private = &complete; 1306 bio->bi_private = &complete;
1086 1307
1087 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1308 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1088 if (PAGE_SIZE != ret) {
1089 bio_put(bio);
1090 return -EIO;
1091 }
1092 btrfsic_submit_bio(READ, bio); 1309 btrfsic_submit_bio(READ, bio);
1093 1310
1094 /* this will also unplug the queue */ 1311 /* this will also unplug the queue */
@@ -1105,7 +1322,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1105 have_csum, csum, generation, 1322 have_csum, csum, generation,
1106 csum_size); 1323 csum_size);
1107 1324
1108 return 0; 1325 return;
1109} 1326}
1110 1327
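
scrub_recheck_block() now returns void: a failed page read no longer aborts the whole recheck, it is recorded in page->io_error and clears the block-wide flag, so callers can still salvage the remaining pages. A sketch of that record-and-continue pattern (read_page() is a stand-in for the bio submission):

    struct rpage { int io_error; };

    struct rblock {
            struct rpage pages[32];
            int page_count;
            int no_io_error_seen;
    };

    static void recheck_block(struct rblock *b, int (*read_page)(int))
    {
            int i;

            b->no_io_error_seen = 1;
            for (i = 0; i < b->page_count; i++) {
                    if (read_page(i) != 0) {
                            b->pages[i].io_error = 1; /* remember the page */
                            b->no_io_error_seen = 0;  /* ...but keep going */
                    }
            }
    }
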
1111static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1328static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1337,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1120 struct btrfs_root *root = fs_info->extent_root; 1337 struct btrfs_root *root = fs_info->extent_root;
1121 void *mapped_buffer; 1338 void *mapped_buffer;
1122 1339
1123 BUG_ON(!sblock->pagev[0].page); 1340 WARN_ON(!sblock->pagev[0]->page);
1124 if (is_metadata) { 1341 if (is_metadata) {
1125 struct btrfs_header *h; 1342 struct btrfs_header *h;
1126 1343
1127 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1344 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1128 h = (struct btrfs_header *)mapped_buffer; 1345 h = (struct btrfs_header *)mapped_buffer;
1129 1346
1130 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1347 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1348 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1349 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1133 BTRFS_UUID_SIZE)) { 1350 BTRFS_UUID_SIZE)) {
@@ -1141,7 +1358,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1141 if (!have_csum) 1358 if (!have_csum)
1142 return; 1359 return;
1143 1360
1144 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1361 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1145 } 1362 }
1146 1363
1147 for (page_num = 0;;) { 1364 for (page_num = 0;;) {
@@ -1157,9 +1374,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1157 page_num++; 1374 page_num++;
1158 if (page_num >= sblock->page_count) 1375 if (page_num >= sblock->page_count)
1159 break; 1376 break;
1160 BUG_ON(!sblock->pagev[page_num].page); 1377 WARN_ON(!sblock->pagev[page_num]->page);
1161 1378
1162 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1379 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1163 } 1380 }
1164 1381
1165 btrfs_csum_final(crc, calculated_csum); 1382 btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1414,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1197 struct scrub_block *sblock_good, 1414 struct scrub_block *sblock_good,
1198 int page_num, int force_write) 1415 int page_num, int force_write)
1199{ 1416{
1200 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1417 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1201 struct scrub_page *page_good = sblock_good->pagev + page_num; 1418 struct scrub_page *page_good = sblock_good->pagev[page_num];
1202 1419
1203 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1420 BUG_ON(page_bad->page == NULL);
1204 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1421 BUG_ON(page_good->page == NULL);
1205 if (force_write || sblock_bad->header_error || 1422 if (force_write || sblock_bad->header_error ||
1206 sblock_bad->checksum_error || page_bad->io_error) { 1423 sblock_bad->checksum_error || page_bad->io_error) {
1207 struct bio *bio; 1424 struct bio *bio;
1208 int ret; 1425 int ret;
1209 DECLARE_COMPLETION_ONSTACK(complete); 1426 DECLARE_COMPLETION_ONSTACK(complete);
1210 1427
1428 if (!page_bad->dev->bdev) {
1429 printk_ratelimited(KERN_WARNING
1430 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1431 return -EIO;
1432 }
1433
1211 bio = bio_alloc(GFP_NOFS, 1); 1434 bio = bio_alloc(GFP_NOFS, 1);
1212 if (!bio) 1435 if (!bio)
1213 return -EIO; 1436 return -EIO;
@@ -1228,6 +1451,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1451 if (!bio_flagged(bio, BIO_UPTODATE)) {
1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1452 btrfs_dev_stat_inc_and_print(page_bad->dev,
1230 BTRFS_DEV_STAT_WRITE_ERRS); 1453 BTRFS_DEV_STAT_WRITE_ERRS);
1454 btrfs_dev_replace_stats_inc(
1455 &sblock_bad->sctx->dev_root->fs_info->
1456 dev_replace.num_write_errors);
1231 bio_put(bio); 1457 bio_put(bio);
1232 return -EIO; 1458 return -EIO;
1233 } 1459 }
@@ -1237,13 +1463,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1463 return 0;
1238} 1464}
1239 1465
1240static void scrub_checksum(struct scrub_block *sblock) 1466static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1467{
1468 int page_num;
1469
1470 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1471 int ret;
1472
1473 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1474 if (ret)
1475 btrfs_dev_replace_stats_inc(
1476 &sblock->sctx->dev_root->fs_info->dev_replace.
1477 num_write_errors);
1478 }
1479}
1480
1481static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1482 int page_num)
1483{
1484 struct scrub_page *spage = sblock->pagev[page_num];
1485
1486 BUG_ON(spage->page == NULL);
1487 if (spage->io_error) {
1488 void *mapped_buffer = kmap_atomic(spage->page);
1489
1490 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1491 flush_dcache_page(spage->page);
1492 kunmap_atomic(mapped_buffer);
1493 }
1494 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1495}
1496
1497static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1498 struct scrub_page *spage)
1499{
1500 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1501 struct scrub_bio *sbio;
1502 int ret;
1503
1504 mutex_lock(&wr_ctx->wr_lock);
1505again:
1506 if (!wr_ctx->wr_curr_bio) {
1507 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1508 GFP_NOFS);
1509 if (!wr_ctx->wr_curr_bio) {
1510 mutex_unlock(&wr_ctx->wr_lock);
1511 return -ENOMEM;
1512 }
1513 wr_ctx->wr_curr_bio->sctx = sctx;
1514 wr_ctx->wr_curr_bio->page_count = 0;
1515 }
1516 sbio = wr_ctx->wr_curr_bio;
1517 if (sbio->page_count == 0) {
1518 struct bio *bio;
1519
1520 sbio->physical = spage->physical_for_dev_replace;
1521 sbio->logical = spage->logical;
1522 sbio->dev = wr_ctx->tgtdev;
1523 bio = sbio->bio;
1524 if (!bio) {
1525 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1526 if (!bio) {
1527 mutex_unlock(&wr_ctx->wr_lock);
1528 return -ENOMEM;
1529 }
1530 sbio->bio = bio;
1531 }
1532
1533 bio->bi_private = sbio;
1534 bio->bi_end_io = scrub_wr_bio_end_io;
1535 bio->bi_bdev = sbio->dev->bdev;
1536 bio->bi_sector = sbio->physical >> 9;
1537 sbio->err = 0;
1538 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1539 spage->physical_for_dev_replace ||
1540 sbio->logical + sbio->page_count * PAGE_SIZE !=
1541 spage->logical) {
1542 scrub_wr_submit(sctx);
1543 goto again;
1544 }
1545
1546 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1547 if (ret != PAGE_SIZE) {
1548 if (sbio->page_count < 1) {
1549 bio_put(sbio->bio);
1550 sbio->bio = NULL;
1551 mutex_unlock(&wr_ctx->wr_lock);
1552 return -EIO;
1553 }
1554 scrub_wr_submit(sctx);
1555 goto again;
1556 }
1557
1558 sbio->pagev[sbio->page_count] = spage;
1559 scrub_page_get(spage);
1560 sbio->page_count++;
1561 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1562 scrub_wr_submit(sctx);
1563 mutex_unlock(&wr_ctx->wr_lock);
1564
1565 return 0;
1566}
1567
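
scrub_add_page_to_wr_bio() batches pages into the current write bio only while they stay physically and logically contiguous; anything else flushes the batch and retries, and a full bio is submitted immediately. The same rule reduced to a self-contained sketch, with flush() standing in for scrub_wr_submit():

    #define PAGE_SZ   4096ULL
    #define BATCH_MAX 16

    struct batch {
            unsigned long long physical;    /* start of the queued run */
            unsigned long long logical;
            int count;                      /* pages queued so far */
    };

    static void flush(struct batch *b)
    {
            /* the real code submits the bio here */
            b->count = 0;
    }

    static void add_page(struct batch *b, unsigned long long physical,
                         unsigned long long logical)
    {
    again:
            if (b->count == 0) {
                    b->physical = physical; /* first page opens a new run */
                    b->logical = logical;
            } else if (b->physical + b->count * PAGE_SZ != physical ||
                       b->logical + b->count * PAGE_SZ != logical) {
                    flush(b);               /* not contiguous: submit, retry */
                    goto again;
            }
            b->count++;
            if (b->count == BATCH_MAX)
                    flush(b);               /* bio is full: submit now */
    }
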
1568static void scrub_wr_submit(struct scrub_ctx *sctx)
1569{
1570 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1571 struct scrub_bio *sbio;
1572
1573 if (!wr_ctx->wr_curr_bio)
1574 return;
1575
1576 sbio = wr_ctx->wr_curr_bio;
1577 wr_ctx->wr_curr_bio = NULL;
1578 WARN_ON(!sbio->bio->bi_bdev);
1579 scrub_pending_bio_inc(sctx);
1580 /* process all writes in a single worker thread, so that the block
1581 * layer can order the requests before they reach the driver; this
1582 * doubled the write performance on spinning disks when measured
1583 * with Linux 3.5 */
1584 btrfsic_submit_bio(WRITE, sbio->bio);
1585}
1586
1587static void scrub_wr_bio_end_io(struct bio *bio, int err)
1588{
1589 struct scrub_bio *sbio = bio->bi_private;
1590 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1591
1592 sbio->err = err;
1593 sbio->bio = bio;
1594
1595 sbio->work.func = scrub_wr_bio_end_io_worker;
1596 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1597}
1598
1599static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1600{
1601 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1602 struct scrub_ctx *sctx = sbio->sctx;
1603 int i;
1604
1605 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1606 if (sbio->err) {
1607 struct btrfs_dev_replace *dev_replace =
1608 &sbio->sctx->dev_root->fs_info->dev_replace;
1609
1610 for (i = 0; i < sbio->page_count; i++) {
1611 struct scrub_page *spage = sbio->pagev[i];
1612
1613 spage->io_error = 1;
1614 btrfs_dev_replace_stats_inc(&dev_replace->
1615 num_write_errors);
1616 }
1617 }
1618
1619 for (i = 0; i < sbio->page_count; i++)
1620 scrub_page_put(sbio->pagev[i]);
1621
1622 bio_put(sbio->bio);
1623 kfree(sbio);
1624 scrub_pending_bio_dec(sctx);
1625}
1626
1627static int scrub_checksum(struct scrub_block *sblock)
1241{ 1628{
1242 u64 flags; 1629 u64 flags;
1243 int ret; 1630 int ret;
1244 1631
1245 BUG_ON(sblock->page_count < 1); 1632 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1633 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1634 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1635 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1636 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1642,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1642 WARN_ON(1);
1256 if (ret) 1643 if (ret)
1257 scrub_handle_errored_block(sblock); 1644 scrub_handle_errored_block(sblock);
1645
1646 return ret;
1258} 1647}
1259 1648
1260static int scrub_checksum_data(struct scrub_block *sblock) 1649static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1650{
1262 struct scrub_dev *sdev = sblock->sdev; 1651 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1652 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1653 u8 *on_disk_csum;
1265 struct page *page; 1654 struct page *page;
1266 void *buffer; 1655 void *buffer;
1267 u32 crc = ~(u32)0; 1656 u32 crc = ~(u32)0;
1268 int fail = 0; 1657 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1658 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1659 u64 len;
1271 int index; 1660 int index;
1272 1661
1273 BUG_ON(sblock->page_count < 1); 1662 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1663 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1664 return 0;
1276 1665
1277 on_disk_csum = sblock->pagev[0].csum; 1666 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1667 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1668 buffer = kmap_atomic(page);
1280 1669
1281 len = sdev->sectorsize; 1670 len = sctx->sectorsize;
1282 index = 0; 1671 index = 0;
1283 for (;;) { 1672 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1673 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1679,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1679 break;
1291 index++; 1680 index++;
1292 BUG_ON(index >= sblock->page_count); 1681 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1682 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1683 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1684 buffer = kmap_atomic(page);
1296 } 1685 }
1297 1686
1298 btrfs_csum_final(crc, csum); 1687 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1688 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1689 fail = 1;
1301 1690
1302 return fail; 1691 return fail;
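
scrub_checksum_data() walks the block in page-sized chunks, feeding each mapped page into one running CRC before the final compare against the on-disk checksum. The shape of that loop, sketched in user-space C with zlib's crc32() as a stand-in (the kernel uses crc32c seeded via btrfs_csum_data()/btrfs_csum_final(), so the values differ; only the structure carries over):

    #include <zlib.h>

    #define PAGE_SZ 4096

    /* returns 0 on match, 1 on checksum failure */
    static int check_data(unsigned char *const *pages, int npages,
                          unsigned long expected)
    {
            unsigned long crc = crc32(0L, Z_NULL, 0);    /* initial value */
            int i;

            for (i = 0; i < npages; i++)
                    crc = crc32(crc, pages[i], PAGE_SZ); /* one chunk per page */

            return crc == expected ? 0 : 1;
    }
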
@@ -1304,9 +1693,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1693
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1694static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1695{
1307 struct scrub_dev *sdev = sblock->sdev; 1696 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1697 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1698 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1699 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1700 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1701 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1710,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1710 int index;
1322 1711
1323 BUG_ON(sblock->page_count < 1); 1712 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1713 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1714 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1715 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1716 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1717
1329 /* 1718 /*
1330 * we don't use the getter functions here, as we 1719 * we don't use the getter functions here, as we
@@ -1332,10 +1721,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1721 * b) the page is already kmapped
1333 */ 1722 */
1334 1723
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1724 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1725 ++fail;
1337 1726
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1727 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1728 ++fail;
1340 1729
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1730 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1734,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1734 BTRFS_UUID_SIZE))
1346 ++fail; 1735 ++fail;
1347 1736
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1737 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1738 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1739 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1740 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1741 index = 0;
@@ -1360,15 +1749,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1749 break;
1361 index++; 1750 index++;
1362 BUG_ON(index >= sblock->page_count); 1751 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1752 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1753 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1754 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1755 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1756 p = mapped_buffer;
1368 } 1757 }
1369 1758
1370 btrfs_csum_final(crc, calculated_csum); 1759 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1760 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1761 ++crc_fail;
1373 1762
1374 return fail || crc_fail; 1763 return fail || crc_fail;
@@ -1377,8 +1766,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1766static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1767{
1379 struct btrfs_super_block *s; 1768 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1769 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1770 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1771 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1772 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1773 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1782,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1782 int index;
1394 1783
1395 BUG_ON(sblock->page_count < 1); 1784 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1785 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1786 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1787 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1788 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1789
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1790 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1791 ++fail_cor;
1403 1792
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1793 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1794 ++fail_gen;
1406 1795
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1796 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1810,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1810 break;
1422 index++; 1811 index++;
1423 BUG_ON(index >= sblock->page_count); 1812 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1813 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1814 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1815 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1816 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1817 p = mapped_buffer;
1429 } 1818 }
1430 1819
1431 btrfs_csum_final(crc, calculated_csum); 1820 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1821 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1822 ++fail_cor;
1434 1823
1435 if (fail_cor + fail_gen) { 1824 if (fail_cor + fail_gen) {
@@ -1438,14 +1827,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * super blocks get rewritten with the next transaction commit 1827 * super blocks get rewritten with the next transaction commit
1439 * anyway. 1828 * anyway.
1440 */ 1829 */
1441 spin_lock(&sdev->stat_lock); 1830 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1831 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1832 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1833 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1834 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1835 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1836 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1837 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1838 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1839 }
1451 1840
@@ -1463,28 +1852,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1852 int i;
1464 1853
1465 for (i = 0; i < sblock->page_count; i++) 1854 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1855 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1856 kfree(sblock);
1469 } 1857 }
1470} 1858}
1471 1859
1472static void scrub_submit(struct scrub_dev *sdev) 1860static void scrub_page_get(struct scrub_page *spage)
1861{
1862 atomic_inc(&spage->ref_count);
1863}
1864
1865static void scrub_page_put(struct scrub_page *spage)
1866{
1867 if (atomic_dec_and_test(&spage->ref_count)) {
1868 if (spage->page)
1869 __free_page(spage->page);
1870 kfree(spage);
1871 }
1872}
1873
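
scrub_page_get()/scrub_page_put() above turn pages into individually refcounted objects, so a page can be shared between its block and any number of bios; whoever drops the last reference frees both the payload page and the wrapper. A user-space equivalent with C11 atomics:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct rpage {
            atomic_int ref_count;
            void *payload;                  /* stands in for spage->page */
    };

    static struct rpage *rpage_alloc(size_t payload_sz)
    {
            struct rpage *p = malloc(sizeof(*p));

            if (!p)
                    return NULL;
            atomic_init(&p->ref_count, 1);  /* creator owns the first ref */
            p->payload = malloc(payload_sz);
            return p;
    }

    static void rpage_get(struct rpage *p)
    {
            atomic_fetch_add(&p->ref_count, 1);
    }

    static void rpage_put(struct rpage *p)
    {
            if (atomic_fetch_sub(&p->ref_count, 1) == 1) { /* last ref gone */
                    free(p->payload);
                    free(p);
            }
    }
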
1874static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1875{
1474 struct scrub_bio *sbio; 1876 struct scrub_bio *sbio;
1475 1877
1476 if (sdev->curr == -1) 1878 if (sctx->curr == -1)
1477 return; 1879 return;
1478 1880
1479 sbio = sdev->bios[sdev->curr]; 1881 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1882 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1883 scrub_pending_bio_inc(sctx);
1482 1884
1483 btrfsic_submit_bio(READ, sbio->bio); 1885 if (!sbio->bio->bi_bdev) {
1886 /*
1887 * this case should not happen. If btrfs_map_block() is
1888 * wrong, it could happen for dev-replace operations on
1889 * missing devices when no mirrors are available, but in
1890 * this case it should already fail the mount.
1891 * This case is handled correctly (but _very_ slowly).
1892 */
1893 printk_ratelimited(KERN_WARNING
1894 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1895 bio_endio(sbio->bio, -EIO);
1896 } else {
1897 btrfsic_submit_bio(READ, sbio->bio);
1898 }
1484} 1899}
1485 1900
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1901static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1902 struct scrub_page *spage)
1488{ 1903{
1489 struct scrub_block *sblock = spage->sblock; 1904 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1905 struct scrub_bio *sbio;
@@ -1494,28 +1909,29 @@ again:
1494 /* 1909 /*
1495 * grab a fresh bio or wait for one to become available 1910 * grab a fresh bio or wait for one to become available
1496 */ 1911 */
1497 while (sdev->curr == -1) { 1912 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1913 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1914 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1915 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1916 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1917 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1918 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1919 spin_unlock(&sctx->list_lock);
1505 } else { 1920 } else {
1506 spin_unlock(&sdev->list_lock); 1921 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1922 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1923 }
1509 } 1924 }
1510 sbio = sdev->bios[sdev->curr]; 1925 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1926 if (sbio->page_count == 0) {
1512 struct bio *bio; 1927 struct bio *bio;
1513 1928
1514 sbio->physical = spage->physical; 1929 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1930 sbio->logical = spage->logical;
1931 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1932 bio = sbio->bio;
1517 if (!bio) { 1933 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1934 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1935 if (!bio)
1520 return -ENOMEM; 1936 return -ENOMEM;
1521 sbio->bio = bio; 1937 sbio->bio = bio;
@@ -1523,14 +1939,15 @@ again:
1523 1939
1524 bio->bi_private = sbio; 1940 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1941 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1942 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1943 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1944 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1945 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1946 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1947 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1948 spage->logical ||
1533 scrub_submit(sdev); 1949 sbio->dev != spage->dev) {
1950 scrub_submit(sctx);
1534 goto again; 1951 goto again;
1535 } 1952 }
1536 1953
@@ -1542,81 +1959,87 @@ again:
1542 sbio->bio = NULL; 1959 sbio->bio = NULL;
1543 return -EIO; 1960 return -EIO;
1544 } 1961 }
1545 scrub_submit(sdev); 1962 scrub_submit(sctx);
1546 goto again; 1963 goto again;
1547 } 1964 }
1548 1965
1549 scrub_block_get(sblock); /* one for the added page */ 1966 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1967 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1968 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1969 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1970 scrub_submit(sctx);
1554 1971
1555 return 0; 1972 return 0;
1556} 1973}
1557 1974
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1975static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1976 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1977 u64 gen, int mirror_num, u8 *csum, int force,
1978 u64 physical_for_dev_replace)
1561{ 1979{
1562 struct scrub_block *sblock; 1980 struct scrub_block *sblock;
1563 int index; 1981 int index;
1564 1982
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1983 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1984 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1985 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1986 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1987 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1988 return -ENOMEM;
1571 } 1989 }
1572 1990
1573 /* one ref inside this function, plus one for each page later on */ 1991 /* one ref inside this function, plus one for each page added to
1992 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1993 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1994 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1995 sblock->no_io_error_seen = 1;
1577 1996
1578 for (index = 0; len > 0; index++) { 1997 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1998 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1999 u64 l = min_t(u64, len, PAGE_SIZE);
1581 2000
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 2001 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 2002 if (!spage) {
1584 if (!spage->page) { 2003leave_nomem:
1585 spin_lock(&sdev->stat_lock); 2004 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 2005 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 2006 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 2007 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2008 return -ENOMEM;
1594 } 2009 }
2010 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2011 scrub_page_get(spage);
2012 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2013 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2014 spage->dev = dev;
1597 spage->flags = flags; 2015 spage->flags = flags;
1598 spage->generation = gen; 2016 spage->generation = gen;
1599 spage->logical = logical; 2017 spage->logical = logical;
1600 spage->physical = physical; 2018 spage->physical = physical;
2019 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2020 spage->mirror_num = mirror_num;
1602 if (csum) { 2021 if (csum) {
1603 spage->have_csum = 1; 2022 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2023 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2024 } else {
1606 spage->have_csum = 0; 2025 spage->have_csum = 0;
1607 } 2026 }
1608 sblock->page_count++; 2027 sblock->page_count++;
2028 spage->page = alloc_page(GFP_NOFS);
2029 if (!spage->page)
2030 goto leave_nomem;
1609 len -= l; 2031 len -= l;
1610 logical += l; 2032 logical += l;
1611 physical += l; 2033 physical += l;
2034 physical_for_dev_replace += l;
1612 } 2035 }
1613 2036
1614 BUG_ON(sblock->page_count == 0); 2037 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2038 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2039 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2040 int ret;
1618 2041
1619 ret = scrub_add_page_to_bio(sdev, spage); 2042 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2043 if (ret) {
1621 scrub_block_put(sblock); 2044 scrub_block_put(sblock);
1622 return ret; 2045 return ret;
@@ -1624,7 +2047,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2047 }
1625 2048
1626 if (force) 2049 if (force)
1627 scrub_submit(sdev); 2050 scrub_submit(sctx);
1628 2051
1629 /* last one frees, either here or in bio completion for last page */ 2052 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2053 scrub_block_put(sblock);
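
The refcounting contract spelled out in scrub_pages() above: the function holds one reference of its own and takes one more for every page handed to a read bio, so the block outlives any in-flight I/O; the trailing scrub_block_put() drops only the creator's share, and whichever put runs last, here or in a bio completion, frees the block. A single-threaded sketch of that ordering (the kernel uses atomics):

    #include <stdio.h>

    struct blk { int refs; };

    static void blk_get(struct blk *b) { b->refs++; }

    static void blk_put(struct blk *b)
    {
            if (--b->refs == 0)
                    printf("block freed\n");    /* stands in for kfree() */
    }

    static void submit_block(struct blk *b, int npages)
    {
            int i;

            b->refs = 1;                /* one ref inside this function */
            for (i = 0; i < npages; i++)
                    blk_get(b);         /* plus one per page in a bio */
            for (i = 0; i < npages; i++)
                    blk_put(b);         /* dropped as each bio completes */
            blk_put(b);                 /* last one frees, here or in the
                                           final completion */
    }
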
@@ -1634,8 +2057,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2057static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2058{
1636 struct scrub_bio *sbio = bio->bi_private; 2059 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2060 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2061
1640 sbio->err = err; 2062 sbio->err = err;
1641 sbio->bio = bio; 2063 sbio->bio = bio;
@@ -1646,10 +2068,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2068static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2069{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2070 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2071 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2072 int i;
1651 2073
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2074 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2075 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2076 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2077 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2093,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2093
1672 bio_put(sbio->bio); 2094 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2095 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2096 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2097 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2098 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2099 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2100
1679 wake_up(&sdev->list_wait); 2101 if (sctx->is_dev_replace &&
2102 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2103 mutex_lock(&sctx->wr_ctx.wr_lock);
2104 scrub_wr_submit(sctx);
2105 mutex_unlock(&sctx->wr_ctx.wr_lock);
2106 }
2107
2108 scrub_pending_bio_dec(sctx);
1680} 2109}
1681 2110
1682static void scrub_block_complete(struct scrub_block *sblock) 2111static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2112{
1684 if (!sblock->no_io_error_seen) 2113 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2114 scrub_handle_errored_block(sblock);
1686 else 2115 } else {
1687 scrub_checksum(sblock); 2116 /*
2117 * if has checksum error, write via repair mechanism in
2118 * dev replace case, otherwise write here in dev replace
2119 * case.
2120 */
2121 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2122 scrub_write_block_to_dev_replace(sblock);
2123 }
1688} 2124}
1689 2125
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2126static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2127 u8 *csum)
1692{ 2128{
1693 struct btrfs_ordered_sum *sum = NULL; 2129 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2131,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2131 unsigned long i;
1696 unsigned long num_sectors; 2132 unsigned long num_sectors;
1697 2133
1698 while (!list_empty(&sdev->csum_list)) { 2134 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2135 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2136 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2137 if (sum->bytenr > logical)
1702 return 0; 2138 return 0;
1703 if (sum->bytenr + sum->len > logical) 2139 if (sum->bytenr + sum->len > logical)
1704 break; 2140 break;
1705 2141
1706 ++sdev->stat.csum_discards; 2142 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2143 list_del(&sum->list);
1708 kfree(sum); 2144 kfree(sum);
1709 sum = NULL; 2145 sum = NULL;
@@ -1711,10 +2147,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2147 if (!sum)
1712 return 0; 2148 return 0;
1713 2149
1714 num_sectors = sum->len / sdev->sectorsize; 2150 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2151 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2152 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2153 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2154 ret = 1;
1719 break; 2155 break;
1720 } 2156 }
@@ -1727,29 +2163,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2163}
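
Editor's note: scrub_find_csum() above walks sctx->csum_list in bytenr order. Entries that end at or below the requested logical address are stale and get dropped (counted as csum_discards); the per-sector checksum inside the covering entry is then found by a linear scan. Since btrfs_lookup_csums_range() lays the per-sector sums out contiguously, the scan is equivalent to a direct index computation. A hypothetical sketch, not part of the patch:

	/*
	 * Equivalent direct lookup, assuming 'sum' covers 'logical'
	 * and its sums are contiguous from sum->bytenr onwards.
	 */
	unsigned long i = (logical - sum->bytenr) / sctx->sectorsize;

	if (i < num_sectors && sum->sums[i].bytenr == logical) {
		memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
		ret = 1;
	}
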
1728 2164
1729/* scrub extent tries to collect up to 64 kB for each bio */ 2165/* scrub extent tries to collect up to 64 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2166static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2167 u64 physical, struct btrfs_device *dev, u64 flags,
2168 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2169{
1733 int ret; 2170 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2171 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2172 u32 blocksize;
1736 2173
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2174 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2175 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2176 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2177 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2178 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2179 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2180 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2181 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2182 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2183 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2184 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2185 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2186 spin_unlock(&sctx->stat_lock);
1750 } else { 2187 } else {
1751 blocksize = sdev->sectorsize; 2188 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2189 WARN_ON(1);
1753 } 2190 }
1754 2191
1755 while (len) { 2192 while (len) {
@@ -1758,26 +2195,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2195
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2196 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2197 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2198 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2199 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2200 ++sctx->stat.no_csum;
2201 if (sctx->is_dev_replace && !have_csum) {
2202 ret = copy_nocow_pages(sctx, logical, l,
2203 mirror_num,
2204 physical_for_dev_replace);
2205 goto behind_scrub_pages;
2206 }
1764 } 2207 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2208 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2209 mirror_num, have_csum ? csum : NULL, 0,
2210 physical_for_dev_replace);
2211behind_scrub_pages:
1767 if (ret) 2212 if (ret)
1768 return ret; 2213 return ret;
1769 len -= l; 2214 len -= l;
1770 logical += l; 2215 logical += l;
1771 physical += l; 2216 physical += l;
2217 physical_for_dev_replace += l;
1772 } 2218 }
1773 return 0; 2219 return 0;
1774} 2220}
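
Editor's note: as the comment above scrub_extent() says, the goal is to collect up to 64 kB per bio. The extent is cut into blocksize units (sectorsize for data, nodesize for metadata); each unit becomes one scrub_pages() call, and scrub_add_page_to_rd_bio() packs the resulting pages into the current read bio until it is full or discontiguous. A worked example with assumed sizes (sectorsize equal to a 4 KiB PAGE_SIZE):

	/* Illustrative numbers only, not taken from the patch: */
	u32 blocksize = 4096;			/* sctx->sectorsize, data extent */
	u64 len = 64 * 1024;			/* extent length */
	int nblocks = div_u64(len, blocksize);	/* -> 16 scrub_pages() calls */
	/* 16 single-page scrub blocks, typically merged into one
	 * contiguous 64 kB read bio */
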
1775 2221
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2222static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2223 struct map_lookup *map,
2224 struct btrfs_device *scrub_dev,
2225 int num, u64 base, u64 length,
2226 int is_dev_replace)
1778{ 2227{
1779 struct btrfs_path *path; 2228 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2229 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2230 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2231 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2232 struct btrfs_extent_item *extent;
@@ -1797,9 +2246,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2246 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2247 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2248 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2249 u64 increment = map->stripe_len;
1802 u64 offset; 2250 u64 offset;
2251 u64 extent_logical;
2252 u64 extent_physical;
2253 u64 extent_len;
2254 struct btrfs_device *extent_dev;
2255 int extent_mirror_num;
1803 2256
1804 nstripes = length; 2257 nstripes = length;
1805 offset = 0; 2258 offset = 0;
@@ -1843,8 +2296,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2296 */
1844 logical = base + offset; 2297 logical = base + offset;
1845 2298
1846 wait_event(sdev->list_wait, 2299 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2300 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2301 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2302 wake_up(&fs_info->scrub_pause_wait);
1850 2303
@@ -1898,7 +2351,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2351 * canceled?
1899 */ 2352 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2353 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2354 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2355 ret = -ECANCELED;
1903 goto out; 2356 goto out;
1904 } 2357 }
@@ -1907,9 +2360,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2360 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2361 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2362 /* push queued extents */
1910 scrub_submit(sdev); 2363 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2364 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2365 mutex_lock(&sctx->wr_ctx.wr_lock);
2366 scrub_wr_submit(sctx);
2367 mutex_unlock(&sctx->wr_ctx.wr_lock);
2368 wait_event(sctx->list_wait,
2369 atomic_read(&sctx->bios_in_flight) == 0);
2370 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2371 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2372 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2373 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2384,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2384
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2385 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2386 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2387 &sctx->csum_list, 1);
1930 if (ret) 2388 if (ret)
1931 goto out; 2389 goto out;
1932 2390
@@ -2004,9 +2462,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2462 key.objectid;
2005 } 2463 }
2006 2464
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2465 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2466 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2467 extent_len = key.offset;
2468 extent_dev = scrub_dev;
2469 extent_mirror_num = mirror_num;
2470 if (is_dev_replace)
2471 scrub_remap_extent(fs_info, extent_logical,
2472 extent_len, &extent_physical,
2473 &extent_dev,
2474 &extent_mirror_num);
2475 ret = scrub_extent(sctx, extent_logical, extent_len,
2476 extent_physical, extent_dev, flags,
2477 generation, extent_mirror_num,
2478 key.objectid - logical + physical);
2010 if (ret) 2479 if (ret)
2011 goto out; 2480 goto out;
2012 2481
@@ -2016,29 +2485,34 @@ next:
2016 btrfs_release_path(path); 2485 btrfs_release_path(path);
2017 logical += increment; 2486 logical += increment;
2018 physical += map->stripe_len; 2487 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2488 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2489 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2490 spin_unlock(&sctx->stat_lock);
2022 } 2491 }
2492out:
2023 /* push queued extents */ 2493 /* push queued extents */
2024 scrub_submit(sdev); 2494 scrub_submit(sctx);
2495 mutex_lock(&sctx->wr_ctx.wr_lock);
2496 scrub_wr_submit(sctx);
2497 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2498
2026out:
2027 blk_finish_plug(&plug); 2499 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2500 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2501 return ret < 0 ? ret : 0;
2030} 2502}
2031 2503
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2504static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2505 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2506 u64 chunk_tree, u64 chunk_objectid,
2507 u64 chunk_offset, u64 length,
2508 u64 dev_offset, int is_dev_replace)
2035{ 2509{
2036 struct btrfs_mapping_tree *map_tree = 2510 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2511 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2512 struct map_lookup *map;
2039 struct extent_map *em; 2513 struct extent_map *em;
2040 int i; 2514 int i;
2041 int ret = -EINVAL; 2515 int ret = 0;
2042 2516
2043 read_lock(&map_tree->map_tree.lock); 2517 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2518 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2529,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2529 goto out;
2056 2530
2057 for (i = 0; i < map->num_stripes; ++i) { 2531 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2532 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2533 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2534 ret = scrub_stripe(sctx, map, scrub_dev, i,
2535 chunk_offset, length,
2536 is_dev_replace);
2061 if (ret) 2537 if (ret)
2062 goto out; 2538 goto out;
2063 } 2539 }
@@ -2069,11 +2545,13 @@ out:
2069} 2545}
2070 2546
2071static noinline_for_stack 2547static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2548int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2549 struct btrfs_device *scrub_dev, u64 start, u64 end,
2550 int is_dev_replace)
2073{ 2551{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2552 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2553 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2554 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2555 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2556 u64 length;
2079 u64 chunk_tree; 2557 u64 chunk_tree;
@@ -2085,6 +2563,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2563 struct btrfs_key key;
2086 struct btrfs_key found_key; 2564 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2565 struct btrfs_block_group_cache *cache;
2566 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2567
2089 path = btrfs_alloc_path(); 2568 path = btrfs_alloc_path();
2090 if (!path) 2569 if (!path)
@@ -2094,11 +2573,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2573 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2574 path->skip_locking = 1;
2096 2575
2097 key.objectid = sdev->dev->devid; 2576 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2577 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2578 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2579
2101
2102 while (1) { 2580 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2581 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2582 if (ret < 0)
@@ -2117,7 +2595,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2595
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2596 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2597
2120 if (found_key.objectid != sdev->dev->devid) 2598 if (found_key.objectid != scrub_dev->devid)
2121 break; 2599 break;
2122 2600
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2601 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2629,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2629 ret = -ENOENT;
2152 break; 2630 break;
2153 } 2631 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2632 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2633 dev_replace->cursor_left = found_key.offset;
2634 dev_replace->item_needs_writeback = 1;
2635 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2636 chunk_offset, length, found_key.offset,
2637 is_dev_replace);
2638
2639 /*
                                                             2640		 * Flush and submit all pending read and write bios,
                                                             2641		 * then wait for them.
                                                             2642		 * Note that in the dev replace case, a read request causes
                                                             2643		 * write requests that are submitted in the read completion
                                                             2644		 * worker. Therefore all write requests must be flushed so
                                                             2645		 * that all read and write requests have really completed
                                                             2646		 * by the time bios_in_flight
                                                             2647		 * changes to 0.
2648 */
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2650 scrub_submit(sctx);
2651 mutex_lock(&sctx->wr_ctx.wr_lock);
2652 scrub_wr_submit(sctx);
2653 mutex_unlock(&sctx->wr_ctx.wr_lock);
2654
2655 wait_event(sctx->list_wait,
2656 atomic_read(&sctx->bios_in_flight) == 0);
2657 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2658 atomic_inc(&fs_info->scrubs_paused);
2659 wake_up(&fs_info->scrub_pause_wait);
2660 wait_event(sctx->list_wait,
2661 atomic_read(&sctx->workers_pending) == 0);
2662
2663 mutex_lock(&fs_info->scrub_lock);
2664 while (atomic_read(&fs_info->scrub_pause_req)) {
2665 mutex_unlock(&fs_info->scrub_lock);
2666 wait_event(fs_info->scrub_pause_wait,
2667 atomic_read(&fs_info->scrub_pause_req) == 0);
2668 mutex_lock(&fs_info->scrub_lock);
2669 }
2670 atomic_dec(&fs_info->scrubs_paused);
2671 mutex_unlock(&fs_info->scrub_lock);
2672 wake_up(&fs_info->scrub_pause_wait);
2673
2674 dev_replace->cursor_left = dev_replace->cursor_right;
2675 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2676 btrfs_put_block_group(cache);
2157 if (ret) 2677 if (ret)
2158 break; 2678 break;
2679 if (is_dev_replace &&
2680 atomic64_read(&dev_replace->num_write_errors) > 0) {
2681 ret = -EIO;
2682 break;
2683 }
2684 if (sctx->stat.malloc_errors > 0) {
2685 ret = -ENOMEM;
2686 break;
2687 }
2159 2688
2160 key.offset = found_key.offset + length; 2689 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2690 btrfs_release_path(path);
@@ -2170,14 +2699,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2699 return ret < 0 ? ret : 0;
2171} 2700}
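
Editor's note: the comment block in the loop above is the heart of the dev replace quiesce logic, so the failure it prevents is worth spelling out. A comment-only sketch, names as in this patch:

	/*
	 * Read completions can queue writes (scrub_bio_end_io_worker
	 * calls scrub_wr_submit() when flush_all_writes is set), so:
	 *
	 *	wait_event(sctx->list_wait,
	 *		   atomic_read(&sctx->bios_in_flight) == 0);
	 *
	 * on its own would be wrong here: it can observe zero while
	 * write bios produced by the last read completions are still
	 * only queued. Hence the sequence above: set flush_all_writes,
	 * scrub_submit() the reads, scrub_wr_submit() the writes, and
	 * only then wait for bios_in_flight to drain.
	 */
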
2172 2701
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2702static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2703 struct btrfs_device *scrub_dev)
2174{ 2704{
2175 int i; 2705 int i;
2176 u64 bytenr; 2706 u64 bytenr;
2177 u64 gen; 2707 u64 gen;
2178 int ret; 2708 int ret;
2179 struct btrfs_device *device = sdev->dev; 2709 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2710
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2711 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2712 return -EIO;
@@ -2186,15 +2715,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2715
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2716 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2717 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2718 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2719 break;
2191 2720
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2721 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2722 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2723 NULL, 1, bytenr);
2194 if (ret) 2724 if (ret)
2195 return ret; 2725 return ret;
2196 } 2726 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2727 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2728
2199 return 0; 2729 return 0;
2200} 2730}
@@ -2202,19 +2732,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2732/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2733 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2734 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2735static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2736 int is_dev_replace)
2206{ 2737{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2738 int ret = 0;
2209 2739
2210 mutex_lock(&fs_info->scrub_lock); 2740 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2741 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2742 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2743 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2744 &fs_info->generic_worker);
2745 else
2746 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2749 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2750 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2751 if (ret)
2217 goto out; 2752 goto out;
2753 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2754 "scrubwrc",
2755 fs_info->thread_pool_size,
2756 &fs_info->generic_worker);
2757 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2758 ret = btrfs_start_workers(
2759 &fs_info->scrub_wr_completion_workers);
2760 if (ret)
2761 goto out;
2762 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2763 &fs_info->generic_worker);
2764 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2765 if (ret)
2766 goto out;
2218 } 2767 }
2219 ++fs_info->scrub_workers_refcnt; 2768 ++fs_info->scrub_workers_refcnt;
2220out: 2769out:
@@ -2223,40 +2772,41 @@ out:
2223 return ret; 2772 return ret;
2224} 2773}
2225 2774
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2775static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2776{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2777 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2778 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2779 btrfs_stop_workers(&fs_info->scrub_workers);
2780 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2781 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2782 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2783 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2784 mutex_unlock(&fs_info->scrub_lock);
2235} 2785}
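
Editor's note: scrub_workers_get()/scrub_workers_put() now manage three pools (scrub, scrubwrc, scrubnc) behind a single refcount. The intended pairing, as a hypothetical caller sketch with error handling elided:

	ret = scrub_workers_get(fs_info, is_dev_replace);
	if (ret)			/* refcnt 0 -> 1 starts all three pools */
		return ret;
	/* ... scrub or replace runs, possibly for a long time ... */
	scrub_workers_put(fs_info);	/* refcnt 1 -> 0 stops all three pools */

Note the asymmetry: in the dev replace case the scrub pool is started with a single thread (the is_dev_replace branch above).
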
2236 2786
2237 2787int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2788 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2789 int readonly, int is_dev_replace)
2240{ 2790{
2241 struct scrub_dev *sdev; 2791 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2792 int ret;
2244 struct btrfs_device *dev; 2793 struct btrfs_device *dev;
2245 2794
2246 if (btrfs_fs_closing(root->fs_info)) 2795 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2796 return -EINVAL;
2248 2797
2249 /* 2798 /*
2250 * check some assumptions 2799 * check some assumptions
2251 */ 2800 */
2252 if (root->nodesize != root->leafsize) { 2801 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2802 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2803 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2804 fs_info->chunk_root->nodesize,
2805 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2806 return -EINVAL;
2257 } 2807 }
2258 2808
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2809 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2810 /*
2261 * in this case scrub is unable to calculate the checksum 2811 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2812 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2814,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2814 */
2265 printk(KERN_ERR 2815 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2816 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2817 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2818 return -EINVAL;
2269 } 2819 }
2270 2820
2271 if (root->sectorsize != PAGE_SIZE) { 2821 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2822 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2823 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2824 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2825 fs_info->chunk_root->sectorsize,
2826 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2827 return -EINVAL;
2277 } 2828 }
2278 2829
2279 ret = scrub_workers_get(root); 2830 if (fs_info->chunk_root->nodesize >
2831 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2832 fs_info->chunk_root->sectorsize >
2833 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2834 /*
2835 * would exhaust the array bounds of pagev member in
2836 * struct scrub_block
2837 */
2838 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2839 fs_info->chunk_root->nodesize,
2840 SCRUB_MAX_PAGES_PER_BLOCK,
2841 fs_info->chunk_root->sectorsize,
2842 SCRUB_MAX_PAGES_PER_BLOCK);
2843 return -EINVAL;
2844 }
2845
2846 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2847 if (ret)
2281 return ret; 2848 return ret;
2282 2849
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2850 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2851 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2852 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2288 return -ENODEV; 2855 return -ENODEV;
2289 } 2856 }
2290 mutex_lock(&fs_info->scrub_lock); 2857 mutex_lock(&fs_info->scrub_lock);
2291 2858
2292 if (!dev->in_fs_metadata) { 2859 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2860 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2861 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2862 scrub_workers_put(fs_info);
2296 return -ENODEV; 2863 return -EIO;
2297 } 2864 }
2298 2865
2299 if (dev->scrub_device) { 2866 btrfs_dev_replace_lock(&fs_info->dev_replace);
2867 if (dev->scrub_device ||
2868 (!is_dev_replace &&
2869 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2870 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2874 return -EINPROGRESS;
2304 } 2875 }
2305 sdev = scrub_setup_dev(dev); 2876 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2877 sctx = scrub_setup_ctx(dev, is_dev_replace);
2878 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2879 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2880 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2881 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2882 return PTR_ERR(sctx);
2311 } 2883 }
2312 sdev->readonly = readonly; 2884 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2885 dev->scrub_device = sctx;
2314 2886
2315 atomic_inc(&fs_info->scrubs_running); 2887 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2888 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2889 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2890
2319 down_read(&fs_info->scrub_super_lock); 2891 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2892 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2893 ret = scrub_supers(sctx, dev);
2894 up_read(&fs_info->scrub_super_lock);
2895 }
2322 2896
2323 if (!ret) 2897 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2898 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2899 is_dev_replace);
2325 2900
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2901 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2902 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2903 wake_up(&fs_info->scrub_pause_wait);
2329 2904
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2905 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2906
2332 if (progress) 2907 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2908 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2909
2335 mutex_lock(&fs_info->scrub_lock); 2910 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2911 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2912 mutex_unlock(&fs_info->scrub_lock);
2338 2913
2339 scrub_free_dev(sdev); 2914 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2915 scrub_workers_put(fs_info);
2341 2916
2342 return ret; 2917 return ret;
2343} 2918}
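
Editor's note: btrfs_scrub_dev() now threads a third lock into its setup path. The nesting it establishes, outermost first (sketch derived from the code above):

	/*
	 * fs_info->fs_devices->device_list_mutex	keeps 'dev' from
	 *						going away
	 *   fs_info->scrub_lock			scrub state, refcnt
	 *     btrfs_dev_replace_lock(&fs_info->dev_replace)
	 *						ongoing-replace check
	 *
	 * Every error path releases these in reverse order and then
	 * drops the worker refcount via scrub_workers_put().
	 */
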
@@ -2377,9 +2952,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2952 up_write(&root->fs_info->scrub_super_lock);
2378} 2953}
2379 2954
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2955int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2956{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2957 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2958 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2959 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2973,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2973 return 0;
2400} 2974}
2401 2975
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2976int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2977 struct btrfs_device *dev)
2403{ 2978{
2404 return __btrfs_scrub_cancel(root->fs_info); 2979 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2980
2412 mutex_lock(&fs_info->scrub_lock); 2981 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2982 sctx = dev->scrub_device;
2414 if (!sdev) { 2983 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2984 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2985 return -ENOTCONN;
2417 } 2986 }
2418 atomic_inc(&sdev->cancel_req); 2987 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2988 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2989 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2990 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +3007,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 3007 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3008 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3009 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3010 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3011 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3012 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3013 return -ENODEV;
2445 } 3014 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3015 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3016 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3017
2449 return ret; 3018 return ret;
@@ -2453,15 +3022,291 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3022 struct btrfs_scrub_progress *progress)
2454{ 3023{
2455 struct btrfs_device *dev; 3024 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3025 struct scrub_ctx *sctx = NULL;
2457 3026
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3027 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3028 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3029 if (dev)
2461 sdev = dev->scrub_device; 3030 sctx = dev->scrub_device;
2462 if (sdev) 3031 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3032 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3033 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3034
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3035 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3036}
3037
3038static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3039 u64 extent_logical, u64 extent_len,
3040 u64 *extent_physical,
3041 struct btrfs_device **extent_dev,
3042 int *extent_mirror_num)
3043{
3044 u64 mapped_length;
3045 struct btrfs_bio *bbio = NULL;
3046 int ret;
3047
3048 mapped_length = extent_len;
3049 ret = btrfs_map_block(fs_info, READ, extent_logical,
3050 &mapped_length, &bbio, 0);
3051 if (ret || !bbio || mapped_length < extent_len ||
3052 !bbio->stripes[0].dev->bdev) {
3053 kfree(bbio);
3054 return;
3055 }
3056
3057 *extent_physical = bbio->stripes[0].physical;
3058 *extent_mirror_num = bbio->mirror_num;
3059 *extent_dev = bbio->stripes[0].dev;
3060 kfree(bbio);
3061}
3062
3063static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3064 struct scrub_wr_ctx *wr_ctx,
3065 struct btrfs_fs_info *fs_info,
3066 struct btrfs_device *dev,
3067 int is_dev_replace)
3068{
3069 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3070
3071 mutex_init(&wr_ctx->wr_lock);
3072 wr_ctx->wr_curr_bio = NULL;
3073 if (!is_dev_replace)
3074 return 0;
3075
3076 WARN_ON(!dev->bdev);
3077 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3078 bio_get_nr_vecs(dev->bdev));
3079 wr_ctx->tgtdev = dev;
3080 atomic_set(&wr_ctx->flush_all_writes, 0);
3081 return 0;
3082}
3083
3084static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3085{
3086 mutex_lock(&wr_ctx->wr_lock);
3087 kfree(wr_ctx->wr_curr_bio);
3088 wr_ctx->wr_curr_bio = NULL;
3089 mutex_unlock(&wr_ctx->wr_lock);
3090}
3091
3092static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3093 int mirror_num, u64 physical_for_dev_replace)
3094{
3095 struct scrub_copy_nocow_ctx *nocow_ctx;
3096 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3097
3098 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3099 if (!nocow_ctx) {
3100 spin_lock(&sctx->stat_lock);
3101 sctx->stat.malloc_errors++;
3102 spin_unlock(&sctx->stat_lock);
3103 return -ENOMEM;
3104 }
3105
3106 scrub_pending_trans_workers_inc(sctx);
3107
3108 nocow_ctx->sctx = sctx;
3109 nocow_ctx->logical = logical;
3110 nocow_ctx->len = len;
3111 nocow_ctx->mirror_num = mirror_num;
3112 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3113 nocow_ctx->work.func = copy_nocow_pages_worker;
3114 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3115 &nocow_ctx->work);
3116
3117 return 0;
3118}
3119
3120static void copy_nocow_pages_worker(struct btrfs_work *work)
3121{
3122 struct scrub_copy_nocow_ctx *nocow_ctx =
3123 container_of(work, struct scrub_copy_nocow_ctx, work);
3124 struct scrub_ctx *sctx = nocow_ctx->sctx;
3125 u64 logical = nocow_ctx->logical;
3126 u64 len = nocow_ctx->len;
3127 int mirror_num = nocow_ctx->mirror_num;
3128 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3129 int ret;
3130 struct btrfs_trans_handle *trans = NULL;
3131 struct btrfs_fs_info *fs_info;
3132 struct btrfs_path *path;
3133 struct btrfs_root *root;
3134 int not_written = 0;
3135
3136 fs_info = sctx->dev_root->fs_info;
3137 root = fs_info->extent_root;
3138
3139 path = btrfs_alloc_path();
3140 if (!path) {
3141 spin_lock(&sctx->stat_lock);
3142 sctx->stat.malloc_errors++;
3143 spin_unlock(&sctx->stat_lock);
3144 not_written = 1;
3145 goto out;
3146 }
3147
3148 trans = btrfs_join_transaction(root);
3149 if (IS_ERR(trans)) {
3150 not_written = 1;
3151 goto out;
3152 }
3153
3154 ret = iterate_inodes_from_logical(logical, fs_info, path,
3155 copy_nocow_pages_for_inode,
3156 nocow_ctx);
3157 if (ret != 0 && ret != -ENOENT) {
3158 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3159 (unsigned long long)logical,
3160 (unsigned long long)physical_for_dev_replace,
3161 (unsigned long long)len,
3162 (unsigned long long)mirror_num, ret);
3163 not_written = 1;
3164 goto out;
3165 }
3166
3167out:
3168 if (trans && !IS_ERR(trans))
3169 btrfs_end_transaction(trans, root);
3170 if (not_written)
3171 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3172 num_uncorrectable_read_errors);
3173
3174 btrfs_free_path(path);
3175 kfree(nocow_ctx);
3176
3177 scrub_pending_trans_workers_dec(sctx);
3178}
3179
3180static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3181{
3182 unsigned long index;
3183 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3184 int ret = 0;
3185 struct btrfs_key key;
3186 struct inode *inode = NULL;
3187 struct btrfs_root *local_root;
3188 u64 physical_for_dev_replace;
3189 u64 len;
3190 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3191 int srcu_index;
3192
3193 key.objectid = root;
3194 key.type = BTRFS_ROOT_ITEM_KEY;
3195 key.offset = (u64)-1;
3196
3197 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3198
3199 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3200 if (IS_ERR(local_root)) {
3201 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3202 return PTR_ERR(local_root);
3203 }
3204
3205 key.type = BTRFS_INODE_ITEM_KEY;
3206 key.objectid = inum;
3207 key.offset = 0;
3208 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3209 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3210 if (IS_ERR(inode))
3211 return PTR_ERR(inode);
3212
3213 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3214 len = nocow_ctx->len;
3215 while (len >= PAGE_CACHE_SIZE) {
3216 struct page *page = NULL;
3217 int ret_sub;
3218
3219 index = offset >> PAGE_CACHE_SHIFT;
3220
3221 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3222 if (!page) {
3223 pr_err("find_or_create_page() failed\n");
3224 ret = -ENOMEM;
3225 goto next_page;
3226 }
3227
3228 if (PageUptodate(page)) {
3229 if (PageDirty(page))
3230 goto next_page;
3231 } else {
3232 ClearPageError(page);
3233 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3234 io_tree,
3235 page, btrfs_get_extent,
3236 nocow_ctx->mirror_num);
3237 if (ret_sub) {
3238 ret = ret_sub;
3239 goto next_page;
3240 }
3241 wait_on_page_locked(page);
3242 if (!PageUptodate(page)) {
3243 ret = -EIO;
3244 goto next_page;
3245 }
3246 }
3247 ret_sub = write_page_nocow(nocow_ctx->sctx,
3248 physical_for_dev_replace, page);
3249 if (ret_sub) {
3250 ret = ret_sub;
3251 goto next_page;
3252 }
3253
3254next_page:
3255 if (page) {
3256 unlock_page(page);
3257 put_page(page);
3258 }
3259 offset += PAGE_CACHE_SIZE;
3260 physical_for_dev_replace += PAGE_CACHE_SIZE;
3261 len -= PAGE_CACHE_SIZE;
3262 }
3263
3264 if (inode)
3265 iput(inode);
3266 return ret;
3267}
3268
3269static int write_page_nocow(struct scrub_ctx *sctx,
3270 u64 physical_for_dev_replace, struct page *page)
3271{
3272 struct bio *bio;
3273 struct btrfs_device *dev;
3274 int ret;
3275 DECLARE_COMPLETION_ONSTACK(compl);
3276
3277 dev = sctx->wr_ctx.tgtdev;
3278 if (!dev)
3279 return -EIO;
3280 if (!dev->bdev) {
3281 printk_ratelimited(KERN_WARNING
3282 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3283 return -EIO;
3284 }
3285 bio = bio_alloc(GFP_NOFS, 1);
3286 if (!bio) {
3287 spin_lock(&sctx->stat_lock);
3288 sctx->stat.malloc_errors++;
3289 spin_unlock(&sctx->stat_lock);
3290 return -ENOMEM;
3291 }
3292 bio->bi_private = &compl;
3293 bio->bi_end_io = scrub_complete_bio_end_io;
3294 bio->bi_size = 0;
3295 bio->bi_sector = physical_for_dev_replace >> 9;
3296 bio->bi_bdev = dev->bdev;
3297 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3298 if (ret != PAGE_CACHE_SIZE) {
3299leave_with_eio:
3300 bio_put(bio);
3301 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3302 return -EIO;
3303 }
3304 btrfsic_submit_bio(WRITE_SYNC, bio);
3305 wait_for_completion(&compl);
3306
3307 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3308 goto leave_with_eio;
3309
3310 bio_put(bio);
3311 return 0;
2467} 3312}
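
Editor's note: write_page_nocow() uses the classic synchronous-submit idiom of this era's bio API. Reduced to its skeleton (error handling elided; 'physical' stands in for physical_for_dev_replace):

	DECLARE_COMPLETION_ONSTACK(compl);
	struct bio *bio = bio_alloc(GFP_NOFS, 1);

	bio->bi_private = &compl;
	bio->bi_end_io = scrub_complete_bio_end_io;	/* just complete()s */
	bio->bi_sector = physical >> 9;			/* bytes -> 512B sectors */
	bio->bi_bdev = dev->bdev;
	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	btrfsic_submit_bio(WRITE_SYNC, bio);
	wait_for_completion(&compl);
	/* success iff test_bit(BIO_UPTODATE, &bio->bi_flags) */
	bio_put(bio);

WRITE_SYNC is used so the block layer treats these writes as latency-sensitive: they run from a worker that is accounted in workers_pending, so their latency directly delays the replace operation.
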
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..321b7fb4e441 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx,
1814 (unsigned long)nce->ino); 1814 (unsigned long)nce->ino);
1815 if (!nce_head) { 1815 if (!nce_head) {
1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1817 if (!nce_head) 1817 if (!nce_head) {
1818 kfree(nce);
1818 return -ENOMEM; 1819 return -ENOMEM;
1820 }
1819 INIT_LIST_HEAD(nce_head); 1821 INIT_LIST_HEAD(nce_head);
1820 1822
1821 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 1823 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
@@ -4397,9 +4399,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4399 if (!path)
4398 return -ENOMEM; 4400 return -ENOMEM;
4399 4401
4400 spin_lock(&send_root->root_times_lock); 4402 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4403 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4404 spin_unlock(&send_root->root_item_lock);
4403 4405
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4406 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4407 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4424,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4424 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4425 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4426 */
4425 spin_lock(&send_root->root_times_lock); 4427 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4428 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4429 spin_unlock(&send_root->root_item_lock);
4428 4430
4429 if (ctransid != start_ctransid) { 4431 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4432 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..d8982e9601d3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
                                      124		 * therefore the canceling is omitted. The only penalty
                                      125		 * is that some I/O remains active until the procedure
                                      126		 * completes. The next time the filesystem is
                                      127		 * mounted writable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -257,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
257 function, line, errstr); 267 function, line, errstr);
258 return; 268 return;
259 } 269 }
260 trans->transaction->aborted = errno; 270 ACCESS_ONCE(trans->transaction->aborted) = errno;
261 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
262} 272}
263/* 273/*
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..fc03aa60b684 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,21 +329,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 goto reserve_fail;
344 } 337 }
345again: 338again:
346 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 339 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
347 if (!h) 340 if (!h) {
348 return ERR_PTR(-ENOMEM); 341 ret = -ENOMEM;
342 goto alloc_fail;
343 }
349 344
350 /* 345 /*
351 * If we are JOIN_NOLOCK we're already committing a transaction and 346 * If we are JOIN_NOLOCK we're already committing a transaction and
@@ -372,11 +367,7 @@ again:
372 if (ret < 0) { 367 if (ret < 0) {
373 /* We must get the transaction if we are JOIN_NOLOCK. */ 368 /* We must get the transaction if we are JOIN_NOLOCK. */
374 BUG_ON(type == TRANS_JOIN_NOLOCK); 369 BUG_ON(type == TRANS_JOIN_NOLOCK);
375 370 goto join_fail;
376 if (type < TRANS_JOIN_NOLOCK)
377 sb_end_intwrite(root->fs_info->sb);
378 kmem_cache_free(btrfs_trans_handle_cachep, h);
379 return ERR_PTR(ret);
380 } 371 }
381 372
382 cur_trans = root->fs_info->running_transaction; 373 cur_trans = root->fs_info->running_transaction;
@@ -417,18 +408,33 @@ got_it:
417 if (!current->journal_info && type != TRANS_USERSPACE) 408 if (!current->journal_info && type != TRANS_USERSPACE)
418 current->journal_info = h; 409 current->journal_info = h;
419 return h; 410 return h;
411
412join_fail:
413 if (type < TRANS_JOIN_NOLOCK)
414 sb_end_intwrite(root->fs_info->sb);
415 kmem_cache_free(btrfs_trans_handle_cachep, h);
416alloc_fail:
417 if (num_bytes)
418 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
419 num_bytes);
420reserve_fail:
421 if (qgroup_reserved)
422 btrfs_qgroup_free(root, qgroup_reserved);
423 return ERR_PTR(ret);
420} 424}
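
Editor's note: the three new labels turn start_transaction()'s error handling into the standard kernel unwind ladder: each label undoes exactly the steps that had succeeded before the failure, in reverse order. The pattern in miniature, with hypothetical names:

	static int setup(void)
	{
		struct handle *h;
		int ret;

		ret = reserve_space();		/* step 1 */
		if (ret)
			goto reserve_fail;
		h = alloc_handle();		/* step 2 */
		if (!h) {
			ret = -ENOMEM;
			goto alloc_fail;
		}
		ret = join_transaction(h);	/* step 3 */
		if (ret)
			goto join_fail;
		return 0;

	join_fail:
		free_handle(h);			/* undo step 2 */
	alloc_fail:
		release_space();		/* undo step 1 */
	reserve_fail:
		return ret;
	}

In the real function above, join_fail ends the sb intwrite and frees the handle, alloc_fail releases the trans_block_rsv reservation, and reserve_fail returns any qgroup reservation.
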
421 425
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 426struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 427 int num_items)
424{ 428{
425 return start_transaction(root, num_items, TRANS_START, 0); 429 return start_transaction(root, num_items, TRANS_START,
430 BTRFS_RESERVE_FLUSH_ALL);
426} 431}
427 432
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 433struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 434 struct btrfs_root *root, int num_items)
430{ 435{
431 return start_transaction(root, num_items, TRANS_START, 1); 436 return start_transaction(root, num_items, TRANS_START,
437 BTRFS_RESERVE_FLUSH_LIMIT);
432} 438}
433 439
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 440struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
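The start_transaction() rework above replaces scattered early returns with a single unwind chain: join_fail undoes the handle allocation, alloc_fail releases the block reservation, reserve_fail frees the qgroup reservation. A minimal userspace sketch of the same goto-unwind idiom, with illustrative names standing in for the btrfs resources:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the reservation and handle used by start_transaction(). */
static int reserve_space(void)  { return 0; }
static void release_space(void) { puts("reservation released"); }
static int join_transaction(void *h) { (void)h; return -1; /* force failure */ }

static int start_op(void)
{
    void *handle = NULL;
    int ret;

    ret = reserve_space();
    if (ret)
        goto reserve_fail;

    handle = malloc(64);
    if (!handle) {
        ret = -1;
        goto alloc_fail;
    }

    ret = join_transaction(handle);
    if (ret)
        goto join_fail;

    free(handle);
    return 0;

join_fail:                      /* undo in reverse order of setup */
    free(handle);
alloc_fail:
    release_space();
reserve_fail:
    return ret;
}

int main(void)
{
    printf("start_op() = %d\n", start_op());
    return 0;
}
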
@@ -461,28 +467,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 467int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 468{
463 struct btrfs_transaction *cur_trans = NULL, *t; 469 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 470 int ret = 0;
465 471
466 ret = 0;
467 if (transid) { 472 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 473 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 474 goto out;
470 475
476 ret = -EINVAL;
471 /* find specified transaction */ 477 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 478 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 479 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 480 if (t->transid == transid) {
475 cur_trans = t; 481 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 482 atomic_inc(&cur_trans->use_count);
483 ret = 0;
477 break; 484 break;
478 } 485 }
479 if (t->transid > transid) 486 if (t->transid > transid) {
487 ret = 0;
480 break; 488 break;
489 }
481 } 490 }
482 spin_unlock(&root->fs_info->trans_lock); 491 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 492 /* The specified transaction doesn't exist */
484 if (!cur_trans) 493 if (!cur_trans)
485 goto out; /* bad transid */ 494 goto out;
486 } else { 495 } else {
487 /* find newest transaction that is committing | committed */ 496 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 497 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +511,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 511 }
503 512
504 wait_for_commit(root, cur_trans); 513 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 514 put_transaction(cur_trans);
507 ret = 0;
508out: 515out:
509 return ret; 516 return ret;
510} 517}
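The btrfs_wait_for_commit() change above inverts the return-value handling: ret is preset to -EINVAL before the search and cleared only on the paths that actually account for the requested transid, so falling out of the loop can no longer report success by accident. A small userspace model of the pattern (the error value and sorted list are illustrative):

#include <stdio.h>

static int lookup_transid(const unsigned long *live, int n, unsigned long want)
{
    int ret = -22; /* -EINVAL until proven otherwise */

    for (int i = 0; i < n; i++) {
        if (live[i] == want) {
            ret = 0; /* found: the caller would wait on it */
            break;
        }
        if (live[i] > want) {
            ret = 0; /* older transid: already committed and gone */
            break;
        }
    }
    return ret; /* still -EINVAL: a bogus future transid */
}

int main(void)
{
    unsigned long live[] = { 40, 41, 42 };

    printf("transid 41 -> %d\n", lookup_transid(live, 3, 41));
    printf("transid 99 -> %d\n", lookup_transid(live, 3, 99));
    return 0;
}
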
@@ -851,7 +858,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 858 return ret;
852 859
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 860 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 861 WARN_ON(ret);
862 ret = btrfs_run_dev_replace(trans, root->fs_info);
863 WARN_ON(ret);
855 864
856 ret = btrfs_run_qgroups(trans, root->fs_info); 865 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 866 BUG_ON(ret);
@@ -874,6 +883,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 883 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 884 up_write(&fs_info->extent_commit_sem);
876 885
886 btrfs_after_dev_replace_commit(fs_info);
887
877 return 0; 888 return 0;
878} 889}
879 890
@@ -958,7 +969,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 969 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 970 struct btrfs_trans_handle *trans;
960 int ret; 971 int ret;
961 unsigned long nr;
962 972
963 if (xchg(&root->defrag_running, 1)) 973 if (xchg(&root->defrag_running, 1))
964 return 0; 974 return 0;
@@ -970,9 +980,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 980
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 981 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 982
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 983 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 984 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 985 cond_resched();
977 986
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 987 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1041,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1041 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1042
1034 if (to_reserve > 0) { 1043 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1044 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1045 to_reserve,
1046 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1047 if (ret) {
1038 pending->error = ret; 1048 pending->error = ret;
1039 goto no_free_objectid; 1049 goto no_free_objectid;
@@ -1191,7 +1201,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1201 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1202 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so it is impossible. */ 1203 /* We have checked the name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST); 1204 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1205 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1206 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1207 goto fail;
@@ -1309,9 +1319,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1319 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1320 * Tell lockdep about it.
1311 */ 1321 */
1312 rwsem_acquire_read( 1322 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1323 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1324 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1325 0, 1, _THIS_IP_);
1315 1326
1316 current->journal_info = ac->newtrans; 1327 current->journal_info = ac->newtrans;
1317 1328
@@ -1349,8 +1360,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1360 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1361 * async commit thread will be the one to unlock it.
1351 */ 1362 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1363 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1364 rwsem_release(
1365 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1366 1, _THIS_IP_);
1354 1367
1355 schedule_delayed_work(&ac->work, 0); 1368 schedule_delayed_work(&ac->work, 0);
1356 1369
@@ -1400,6 +1413,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1413 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1414}
1402 1415
1416static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1417 struct btrfs_root *root)
1418{
1419 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1420 int snap_pending = 0;
1421 int ret;
1422
1423 if (!flush_on_commit) {
1424 spin_lock(&root->fs_info->trans_lock);
1425 if (!list_empty(&trans->transaction->pending_snapshots))
1426 snap_pending = 1;
1427 spin_unlock(&root->fs_info->trans_lock);
1428 }
1429
1430 if (flush_on_commit || snap_pending) {
1431 btrfs_start_delalloc_inodes(root, 1);
1432 btrfs_wait_ordered_extents(root, 1);
1433 }
1434
1435 ret = btrfs_run_delayed_items(trans, root);
1436 if (ret)
1437 return ret;
1438
1439 /*
1440 * running the delayed items may have added new refs. account
1441 * them now so that they hinder processing of more delayed refs
1442 * as little as possible.
1443 */
1444 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1445
1446 /*
1447 * rename doesn't use btrfs_join_transaction, so, once we
1448 * set the transaction to blocked above, we aren't going
1449 * to get any new ordered operations. We can safely run
1450 * it here and know for sure that nothing new will be added
1451 * to the list
1452 */
1453 btrfs_run_ordered_operations(root, 1);
1454
1455 return 0;
1456}
1457
1403/* 1458/*
1404 * btrfs_transaction state sequence: 1459 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1460 * in_commit = 0, blocked = 0 (initial)
@@ -1414,15 +1469,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1469 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1470 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1471 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1472 int ret;
1418 int should_grow = 0; 1473 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1474 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1475
1422 btrfs_run_ordered_operations(root, 0); 1476 ret = btrfs_run_ordered_operations(root, 0);
1477 if (ret) {
1478 btrfs_abort_transaction(trans, root, ret);
1479 goto cleanup_transaction;
1480 }
1423 1481
1424 if (cur_trans->aborted) 1482 /* Stop the commit early if ->aborted is set */
1483 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1484 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1485 goto cleanup_transaction;
1486 }
1426 1487
1427 /* make a pass through all the delayed refs we have so far 1488 /* make a pass through all the delayed refs we have so far
1428 * any running procs may add more while we are here 1489 * any running procs may add more while we are here
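The ->aborted checks added in this function read the flag through ACCESS_ONCE(), which in kernels of this era is a volatile cast: it forces a fresh load from memory at every check, so the compiler cannot cache the first read and a flag set concurrently by another task is still noticed. A standalone illustration:

#include <stdio.h>

/* The era's definition from <linux/compiler.h>. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int aborted; /* stands in for cur_trans->aborted */

int main(void)
{
    aborted = -5; /* as if another task aborted the transaction */

    /* each check below compiles to its own load, never a cached value */
    if (ACCESS_ONCE(aborted)) {
        int ret = ACCESS_ONCE(aborted);
        printf("commit stopped early, ret = %d\n", ret);
    }
    return 0;
}
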
@@ -1490,39 +1551,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1551 should_grow = 1;
1491 1552
1492 do { 1553 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1554 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1555
1499 WARN_ON(cur_trans != trans->transaction); 1556 WARN_ON(cur_trans != trans->transaction);
1500 1557
1501 if (flush_on_commit || snap_pending) { 1558 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1559 if (ret)
1508 goto cleanup_transaction; 1560 goto cleanup_transaction;
1509 1561
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename doesn't use btrfs_join_transaction, so, once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and know for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1562 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1563 TASK_UNINTERRUPTIBLE);
1528 1564
@@ -1535,6 +1571,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1571 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1572 (should_grow && cur_trans->num_joined != joined));
1537 1573
1574 ret = btrfs_flush_all_pending_stuffs(trans, root);
1575 if (ret)
1576 goto cleanup_transaction;
1577
1538 /* 1578 /*
1539 * Ok now we need to make sure to block out any other joins while we 1579 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1580 * commit the transaction. We could have started a join before setting
@@ -1546,6 +1586,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1546 wait_event(cur_trans->writer_wait, 1586 wait_event(cur_trans->writer_wait,
1547 atomic_read(&cur_trans->num_writers) == 1); 1587 atomic_read(&cur_trans->num_writers) == 1);
1548 1588
1589 /* ->aborted might be set after the previous check, so check it */
1590 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1591 ret = cur_trans->aborted;
1592 goto cleanup_transaction;
1593 }
1549 /* 1594 /*
1550 * the reloc mutex makes sure that we stop 1595 * the reloc mutex makes sure that we stop
1551 * the balancing code from coming in and moving 1596 * the balancing code from coming in and moving
@@ -1629,6 +1674,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1629 goto cleanup_transaction; 1674 goto cleanup_transaction;
1630 } 1675 }
1631 1676
1677 /*
1678 * The tasks which save the space cache and inode cache may also
1679 * update ->aborted, check it.
1680 */
1681 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1682 ret = cur_trans->aborted;
1683 mutex_unlock(&root->fs_info->tree_log_mutex);
1684 mutex_unlock(&root->fs_info->reloc_mutex);
1685 goto cleanup_transaction;
1686 }
1687
1632 btrfs_prepare_extent_commit(trans, root); 1688 btrfs_prepare_extent_commit(trans, root);
1633 1689
1634 cur_trans = root->fs_info->running_transaction; 1690 cur_trans = root->fs_info->running_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
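The _noflush to _lflush rename mirrors the switch from a boolean flush argument to the three-valued BTRFS_RESERVE_* policy used in transaction.c above. A compact userspace model of dispatching on such a tri-state reservation policy (the enum names follow the kernel's, the behavior is purely illustrative):

#include <stdio.h>

enum reserve_flush {
    RESERVE_NO_FLUSH,    /* fail fast rather than reclaim space */
    RESERVE_FLUSH_LIMIT, /* bounded reclaim, safe in deadlock-prone paths */
    RESERVE_FLUSH_ALL,   /* reclaim as hard as necessary */
};

static int block_rsv_add(unsigned long bytes, enum reserve_flush flush)
{
    switch (flush) {
    case RESERVE_NO_FLUSH:
        printf("reserve %lu bytes, no flushing\n", bytes);
        break;
    case RESERVE_FLUSH_LIMIT:
        printf("reserve %lu bytes, limited flushing\n", bytes);
        break;
    case RESERVE_FLUSH_ALL:
        printf("reserve %lu bytes, flush everything\n", bytes);
        break;
    }
    return 0;
}

int main(void)
{
    block_rsv_add(4096, RESERVE_FLUSH_LIMIT); /* the _lflush flavor */
    block_rsv_add(4096, RESERVE_FLUSH_ALL);   /* plain btrfs_start_transaction */
    return 0;
}
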
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..9027bb1e7466 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
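fill_inode_item() now routes every field update through a btrfs_map_token. The point of the token is that consecutive setters touching the same extent-buffer page can reuse one mapping instead of mapping and unmapping for each field. A rough userspace model of that caching idea (the struct layout and names here are illustrative, not the kernel's):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096

struct map_token {
    char *buf;          /* backing buffer, i.e. the extent buffer */
    size_t mapped_page; /* page index currently "mapped" */
    int valid;
    int map_calls;      /* instrumentation for the demo */
};

static char *token_map(struct map_token *tok, char *buf, size_t off)
{
    size_t page = off / PAGE_SIZE;

    if (!tok->valid || tok->buf != buf || tok->mapped_page != page) {
        /* the expensive kmap would happen here */
        tok->buf = buf;
        tok->mapped_page = page;
        tok->valid = 1;
        tok->map_calls++;
    }
    return buf + off;
}

static void set_field(struct map_token *tok, char *buf, size_t off, char v)
{
    *token_map(tok, buf, off) = v;
}

int main(void)
{
    static char eb[2 * PAGE_SIZE];
    struct map_token tok = { 0 };

    for (size_t i = 100; i < 140; i++) /* 40 adjacent fields */
        set_field(&tok, eb, i, 1);

    printf("mappings: %d (one per page, not one per field)\n", tok.map_calls);
    return 0;
}
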
@@ -3130,151 +3155,239 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents. So if we have an
3229 * existing extent we want to adjust, we _have_ to check the
3230 * next item to make sure we even need this extent anymore;
3231 * this keeps us from panicking in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
3139 3266
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case where we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191 3356
3192 path->slots[0]--; 3357 if (skip_csum)
3193 btrfs_item_key_to_cpu(path->nodes[0], &key, 3358 return 0;
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3359
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3360 if (em->compress_type) {
3203 struct btrfs_file_extent_item); 3361 csum_offset = 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], 3362 csum_len = block_len;
3205 fi); 3363 }
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok, envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3364
3251 if (path->slots[0] < nritems) { 3365 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3366 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3367 em->block_start + csum_offset,
3254 break; 3368 em->block_start + csum_offset +
3255 } 3369 csum_len - 1, &ordered_sums, 0);
3370 if (ret)
3371 return ret;
3256 3372
3257 if (args->nr) { 3373 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3374 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3375 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3376 list);
3261 if (ret) 3377 if (!ret)
3262 return ret; 3378 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3379 list_del(&sums->list);
3264 btrfs_release_path(path); 3380 kfree(sums);
3265 }
3266 } 3381 }
3267 3382
3268 return 0; 3383 return ret;
3269} 3384}
3270 3385
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3386static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3387 struct btrfs_root *root,
3273 struct inode *inode, 3388 struct inode *inode,
3274 struct btrfs_path *path, 3389 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3390{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3391 struct extent_map *em, *n;
3279 struct list_head extents; 3392 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
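log_one_extent() above only copies checksums for the range that was actually modified: csum_offset and csum_len are derived from em->mod_start/mod_len relative to em->start, except that compressed extents always cover the whole on-disk extent. The window arithmetic in isolation (simplified: the kernel uses max(block_len, orig_block_len) for the compressed case):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

struct em { /* subset of struct extent_map used here */
    u64 start;       /* logical start of the extent */
    u64 block_start; /* start of the extent on disk */
    u64 block_len;   /* on-disk length */
    u64 mod_start;   /* start of the modified subrange */
    u64 mod_len;     /* length of the modified subrange */
    int compressed;
};

static void csum_window(const struct em *em, u64 *first, u64 *last)
{
    u64 csum_offset = em->mod_start - em->start;
    u64 csum_len = em->mod_len;

    if (em->compressed) {
        /* compressed checksums span the whole disk extent */
        csum_offset = 0;
        csum_len = em->block_len;
    }
    *first = em->block_start + csum_offset;
    *last = em->block_start + csum_offset + csum_len - 1;
}

int main(void)
{
    struct em em = { .start = 0, .block_start = 1 << 20,
                     .block_len = 16384, .mod_start = 8192,
                     .mod_len = 4096 };
    u64 first, last;

    csum_window(&em, &first, &last);
    printf("lookup csums in [%llu, %llu]\n",
           (unsigned long long)first, (unsigned long long)last);
    return 0;
}
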
@@ -3283,8 +3396,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3396
3284 INIT_LIST_HEAD(&extents); 3397 INIT_LIST_HEAD(&extents);
3285 3398
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3399 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3400 test_gen = root->fs_info->last_trans_committed;
3290 3401
@@ -3304,47 +3415,27 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3304 em = list_entry(extents.next, struct extent_map, list); 3415 em = list_entry(extents.next, struct extent_map, list);
3305 3416
3306 list_del_init(&em->list); 3417 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308 3418
3309 /* 3419 /*
3310 * If we had an error we just need to delete everybody from our 3420 * If we had an error we just need to delete everybody from our
3311 * private list. 3421 * private list.
3312 */ 3422 */
3313 if (ret) { 3423 if (ret) {
3424 clear_em_logging(tree, em);
3314 free_extent_map(em); 3425 free_extent_map(em);
3315 continue; 3426 continue;
3316 } 3427 }
3317 3428
3318 write_unlock(&tree->lock); 3429 write_unlock(&tree->lock);
3319 3430
3320 /* 3431 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock); 3432 write_lock(&tree->lock);
3433 clear_em_logging(tree, em);
3434 free_extent_map(em);
3341 } 3435 }
3342 WARN_ON(!list_empty(&extents)); 3436 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3437 write_unlock(&tree->lock);
3344 3438
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3439 btrfs_release_path(path);
3349 return ret; 3440 return ret;
3350} 3441}
@@ -3400,7 +3491,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3491
3401 3492
3402 /* today the code can only do partial logging of directories */ 3493 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3494 if (S_ISDIR(inode->i_mode) ||
3495 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3496 &BTRFS_I(inode)->runtime_flags) &&
3497 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3498 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3499 else
3406 max_key.type = (u8)-1; 3500 max_key.type = (u8)-1;
@@ -3432,14 +3526,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3526 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3527 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3530 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3531 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3532 inode, 0, 0);
3437 } else { 3533 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3534 &BTRFS_I(inode)->runtime_flags)) {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3537 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3538 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3539 max_key.type);
3540 } else {
3541 if (inode_only == LOG_INODE_ALL)
3542 fast_search = true;
3543 ret = log_inode_item(trans, log, dst_path, inode);
3544 if (ret) {
3545 err = ret;
3546 goto out_unlock;
3547 }
3548 goto log_extents;
3442 } 3549 }
3550
3443 } 3551 }
3444 if (ret) { 3552 if (ret) {
3445 err = ret; 3553 err = ret;
@@ -3518,11 +3626,10 @@ next_slot:
3518 ins_nr = 0; 3626 ins_nr = 0;
3519 } 3627 }
3520 3628
3629log_extents:
3521 if (fast_search) { 3630 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3631 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3632 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3633 if (ret) {
3527 err = ret; 3634 err = ret;
3528 goto out_unlock; 3635 goto out_unlock;
@@ -3531,8 +3638,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3638 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3639 struct extent_map *em, *n;
3533 3640
3641 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3642 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3643 list_del_init(&em->list);
3644 write_unlock(&tree->lock);
3536 } 3645 }
3537 3646
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3647 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..5cbb7f4b1672 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
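btrfs_get_bdev_and_sb() above folds several previously duplicated open-device sequences (open by path, optional flush, set the block size, read the super block) into one helper with a strict contract: on failure both out parameters come back NULL, so every caller keeps a single error path. A userspace model of that out-parameter contract, with file I/O standing in for the block-device calls:

#include <stdio.h>
#include <stdlib.h>

static int get_dev_and_sb(const char *path, FILE **dev, char **sb)
{
    *dev = fopen(path, "rb");        /* blkdev_get_by_path() stand-in */
    if (!*dev)
        goto error;

    *sb = malloc(4096);              /* btrfs_read_dev_super() stand-in */
    if (!*sb)
        goto close_dev;
    if (fread(*sb, 1, 4096, *dev) == 0)
        goto free_sb;
    return 0;

free_sb:
    free(*sb);
close_dev:
    fclose(*dev);
error:
    *dev = NULL;                     /* callers see a uniform failure state */
    *sb = NULL;
    return -1;
}

int main(void)
{
    FILE *dev;
    char *sb;

    if (get_dev_and_sb("/nonexistent", &dev, &sb)) {
        printf("open failed: dev=%p sb=%p\n", (void *)dev, (void *)sb);
        return 1;
    }
    free(sb);
    fclose(dev);
    return 0;
}
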
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree and it is known
553 * whether the procedure is really active or
554 * not, which means whether this device is
555 * used or whether it should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
641 if (IS_ERR(bdev)) { 711 &bdev, &bh);
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 712 if (ret)
643 goto error; 713 continue;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096); 808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
756 if (ret) 809 if (ret)
757 goto error_close; 810 goto error;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
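The label fix in btrfs_scan_one_device() above guards the printk %s against an on-disk label that arrives without a terminating NUL: the last byte of the fixed-size buffer is forced to '\0' before printing. The same defensive pattern in standalone form (BTRFS_LABEL_SIZE is 256 in this kernel):

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256 /* mirrors BTRFS_LABEL_SIZE */

static void print_label(char label[LABEL_SIZE])
{
    if (label[LABEL_SIZE - 1])
        label[LABEL_SIZE - 1] = '\0'; /* never trust on-disk data */
    printf("device label %s\n", label);
}

int main(void)
{
    char label[LABEL_SIZE];

    memset(label, 'A', sizeof(label)); /* hostile input: no terminator */
    print_label(label);                /* safe after the fix */
    return 0;
}
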
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
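The removal guard above now computes an effective device count: while a device replace is running, the temporary target device inflates num_devices by one, so it is subtracted before the RAID profile minimums are enforced. The check, reduced to its arithmetic in an illustrative userspace form:

#include <stdio.h>
#include <stdbool.h>

static int check_removal(unsigned int num_devices, bool replace_running,
                         bool raid10, bool raid1)
{
    if (replace_running)
        num_devices--; /* don't count the dev-replace target */

    if (raid10 && num_devices <= 4) {
        fprintf(stderr, "unable to go below four devices on raid10\n");
        return -1;
    }
    if (raid1 && num_devices <= 2) {
        fprintf(stderr, "unable to go below two devices on raid1\n");
        return -1;
    }
    return 0;
}

int main(void)
{
    /* 5 devices, one of them a replace target: raid10 removal must fail */
    printf("%d\n", check_removal(5, true, true, false));
    return 0;
}
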
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1433 ret = btrfs_get_bdev_and_sb(device_path,
1375 root->fs_info->bdev_holder); 1434 FMODE_WRITE | FMODE_EXCL,
1376 if (IS_ERR(bdev)) { 1435 root->fs_info->bdev_holder, 0,
1377 ret = PTR_ERR(bdev); 1436 &bdev, &bh);
1437 if (ret)
1378 goto out; 1438 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 if (bdev)
1560 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1561
1496error_brelse: 1562error_brelse:
1497 brelse(bh); 1563 brelse(bh);
1498error_close:
1499 if (bdev) 1564 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1565 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1566out:
@@ -1512,6 +1577,112 @@ error_undo:
1512 goto error_brelse; 1577 goto error_brelse;
1513} 1578}
1514 1579
1580void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1581 struct btrfs_device *srcdev)
1582{
1583 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1584 list_del_rcu(&srcdev->dev_list);
1585 list_del_rcu(&srcdev->dev_alloc_list);
1586 fs_info->fs_devices->num_devices--;
1587 if (srcdev->missing) {
1588 fs_info->fs_devices->missing_devices--;
1589 fs_info->fs_devices->rw_devices++;
1590 }
1591 if (srcdev->can_discard)
1592 fs_info->fs_devices->num_can_discard--;
1593 if (srcdev->bdev)
1594 fs_info->fs_devices->open_devices--;
1595
1596 call_rcu(&srcdev->rcu, free_device);
1597}
1598
1599void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1600 struct btrfs_device *tgtdev)
1601{
1602 struct btrfs_device *next_device;
1603
1604 WARN_ON(!tgtdev);
1605 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1606 if (tgtdev->bdev) {
1607 btrfs_scratch_superblock(tgtdev);
1608 fs_info->fs_devices->open_devices--;
1609 }
1610 fs_info->fs_devices->num_devices--;
1611 if (tgtdev->can_discard)
1612 fs_info->fs_devices->num_can_discard++;
1613
1614 next_device = list_entry(fs_info->fs_devices->devices.next,
1615 struct btrfs_device, dev_list);
1616 if (tgtdev->bdev == fs_info->sb->s_bdev)
1617 fs_info->sb->s_bdev = next_device->bdev;
1618 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1619 fs_info->fs_devices->latest_bdev = next_device->bdev;
1620 list_del_rcu(&tgtdev->dev_list);
1621
1622 call_rcu(&tgtdev->rcu, free_device);
1623
1624 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1625}
1626
1627int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1628 struct btrfs_device **device)
1629{
1630 int ret = 0;
1631 struct btrfs_super_block *disk_super;
1632 u64 devid;
1633 u8 *dev_uuid;
1634 struct block_device *bdev;
1635 struct buffer_head *bh;
1636
1637 *device = NULL;
1638 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1639 root->fs_info->bdev_holder, 0, &bdev, &bh);
1640 if (ret)
1641 return ret;
1642 disk_super = (struct btrfs_super_block *)bh->b_data;
1643 devid = btrfs_stack_device_id(&disk_super->dev_item);
1644 dev_uuid = disk_super->dev_item.uuid;
1645 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1646 disk_super->fsid);
1647 brelse(bh);
1648 if (!*device)
1649 ret = -ENOENT;
1650 blkdev_put(bdev, FMODE_READ);
1651 return ret;
1652}
1653
1654int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1655 char *device_path,
1656 struct btrfs_device **device)
1657{
1658 *device = NULL;
1659 if (strcmp(device_path, "missing") == 0) {
1660 struct list_head *devices;
1661 struct btrfs_device *tmp;
1662
1663 devices = &root->fs_info->fs_devices->devices;
1664 /*
1665 * It is safe to read the devices since the volume_mutex
1666 * is held by the caller.
1667 */
1668 list_for_each_entry(tmp, devices, dev_list) {
1669 if (tmp->in_fs_metadata && !tmp->bdev) {
1670 *device = tmp;
1671 break;
1672 }
1673 }
1674
1675 if (!*device) {
1676 pr_err("btrfs: no missing device found\n");
1677 return -ENOENT;
1678 }
1679
1680 return 0;
1681 } else {
1682 return btrfs_find_device_by_path(root, device_path, device);
1683 }
1684}
1685
1515/* 1686/*
1516 * does all the dirty work required for changing file system's UUID. 1687 * does all the dirty work required for changing file system's UUID.
1517 */ 1688 */
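btrfs_find_device_missing_or_by_path() in the hunk above dispatches on the literal device path "missing": it selects the first device that the metadata still references but whose block device could not be opened, and otherwise falls through to the by-path lookup. The dispatch, modelled in userspace (the struct fields are illustrative):

#include <stdio.h>
#include <string.h>

struct dev { const char *path; int in_metadata; int has_bdev; };

static struct dev *find_device(struct dev *devs, int n, const char *path)
{
    if (strcmp(path, "missing") == 0) {
        for (int i = 0; i < n; i++)
            if (devs[i].in_metadata && !devs[i].has_bdev)
                return &devs[i]; /* referenced but not present */
        return NULL;
    }
    for (int i = 0; i < n; i++)
        if (strcmp(devs[i].path, path) == 0)
            return &devs[i];
    return NULL;
}

int main(void)
{
    struct dev devs[] = {
        { "/dev/sdb", 1, 1 },
        { "/dev/sdc", 1, 0 }, /* in metadata, block device gone */
    };
    struct dev *d = find_device(devs, 2, "missing");

    printf("picked %s\n", d ? d->path : "(none)");
    return 0;
}
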
@@ -1630,7 +1801,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1801 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1802 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1803 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1804 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1805 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1806 BUG_ON(!device); /* Logic error */
1635 1807
1636 if (device->fs_devices->seeding) { 1808 if (device->fs_devices->seeding) {
@@ -1678,16 +1850,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1850 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1851
1680 devices = &root->fs_info->fs_devices->devices; 1852 devices = &root->fs_info->fs_devices->devices;
1681 /* 1853
1682 * we have the volume lock, so we don't need the extra 1854 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1855 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1856 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1857 ret = -EEXIST;
1858 mutex_unlock(
1859 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1860 goto error;
1689 } 1861 }
1690 } 1862 }
1863 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1864
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1865 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1866 if (!device) {
@@ -1737,6 +1910,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1910 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1911 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1912 device->in_fs_metadata = 1;
1913 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1914 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1915 set_blocksize(device->bdev, 4096);
1742 1916
@@ -1844,6 +2018,98 @@ error:
1844 return ret; 2018 return ret;
1845} 2019}
1846 2020
2021int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2022 struct btrfs_device **device_out)
2023{
2024 struct request_queue *q;
2025 struct btrfs_device *device;
2026 struct block_device *bdev;
2027 struct btrfs_fs_info *fs_info = root->fs_info;
2028 struct list_head *devices;
2029 struct rcu_string *name;
2030 int ret = 0;
2031
2032 *device_out = NULL;
2033 if (fs_info->fs_devices->seeding)
2034 return -EINVAL;
2035
2036 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2037 fs_info->bdev_holder);
2038 if (IS_ERR(bdev))
2039 return PTR_ERR(bdev);
2040
2041 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2042
2043 devices = &fs_info->fs_devices->devices;
2044 list_for_each_entry(device, devices, dev_list) {
2045 if (device->bdev == bdev) {
2046 ret = -EEXIST;
2047 goto error;
2048 }
2049 }
2050
2051 device = kzalloc(sizeof(*device), GFP_NOFS);
2052 if (!device) {
2053 ret = -ENOMEM;
2054 goto error;
2055 }
2056
2057 name = rcu_string_strdup(device_path, GFP_NOFS);
2058 if (!name) {
2059 kfree(device);
2060 ret = -ENOMEM;
2061 goto error;
2062 }
2063 rcu_assign_pointer(device->name, name);
2064
2065 q = bdev_get_queue(bdev);
2066 if (blk_queue_discard(q))
2067 device->can_discard = 1;
2068 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2069 device->writeable = 1;
2070 device->work.func = pending_bios_fn;
2071 generate_random_uuid(device->uuid);
2072 device->devid = BTRFS_DEV_REPLACE_DEVID;
2073 spin_lock_init(&device->io_lock);
2074 device->generation = 0;
2075 device->io_width = root->sectorsize;
2076 device->io_align = root->sectorsize;
2077 device->sector_size = root->sectorsize;
2078 device->total_bytes = i_size_read(bdev->bd_inode);
2079 device->disk_total_bytes = device->total_bytes;
2080 device->dev_root = fs_info->dev_root;
2081 device->bdev = bdev;
2082 device->in_fs_metadata = 1;
2083 device->is_tgtdev_for_dev_replace = 1;
2084 device->mode = FMODE_EXCL;
2085 set_blocksize(device->bdev, 4096);
2086 device->fs_devices = fs_info->fs_devices;
2087 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2088 fs_info->fs_devices->num_devices++;
2089 fs_info->fs_devices->open_devices++;
2090 if (device->can_discard)
2091 fs_info->fs_devices->num_can_discard++;
2092 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2093
2094 *device_out = device;
2095 return ret;
2096
2097error:
2098 blkdev_put(bdev, FMODE_EXCL);
2099 return ret;
2100}
2101
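A hedged sketch of the intended caller in the new dev-replace.c (see the diffstat): the start ioctl hands the target path to this function and keeps the returned device for the copy phase. The field name tgtdev_name is an assumption here:

	struct btrfs_device *tgt_device = NULL;
	int ret;

	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
					    &tgt_device);
	if (ret)	/* -EINVAL on a seeding fs, -EEXIST, -ENOMEM, ... */
		return ret;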
2102void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2103 struct btrfs_device *tgtdev)
2104{
2105 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2106 tgtdev->io_width = fs_info->dev_root->sectorsize;
2107 tgtdev->io_align = fs_info->dev_root->sectorsize;
2108 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2109 tgtdev->dev_root = fs_info->dev_root;
2110 tgtdev->in_fs_metadata = 1;
2111}
2112
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2113static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2114 struct btrfs_device *device)
1849{ 2115{
@@ -1900,7 +2166,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2166
1901 if (!device->writeable) 2167 if (!device->writeable)
1902 return -EACCES; 2168 return -EACCES;
1903 if (new_size <= device->total_bytes) 2169 if (new_size <= device->total_bytes ||
2170 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2171 return -EINVAL;
1905 2172
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2173 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2605,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2605 return 1;
2339} 2606}
2340 2607
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2608static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2609 struct btrfs_balance_args *bargs)
2355{ 2610{
@@ -2360,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2360 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2615 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2361 chunk_used = btrfs_block_group_used(&cache->item); 2616 chunk_used = btrfs_block_group_used(&cache->item);
2362 2617
2363 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2618 if (bargs->usage == 0)
2619 user_thresh = 0;
2620 else if (bargs->usage > 100)
2621 user_thresh = cache->key.offset;
2622 else
2623 user_thresh = div_factor_fine(cache->key.offset,
2624 bargs->usage);
2625
2364 if (chunk_used < user_thresh) 2626 if (chunk_used < user_thresh)
2365 ret = 0; 2627 ret = 0;
2366 2628
@@ -2514,15 +2776,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2776 return 1;
2515} 2777}
2516 2778
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
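Both helpers are deleted here only because they move to the new fs/btrfs/math.h (44 added lines in the diffstat) so volumes.c and extent-tree.c can share them. Reconstructed from the bodies removed above, the header plausibly reads:

#include <asm/div64.h>

static inline u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)
{
	if (factor <= 0)
		return 0;
	if (factor >= 100)
		return num;
	num *= factor;
	do_div(num, 100);
	return num;
}

For example, div_factor_fine(1 GiB, 30) is 322122547 bytes (~307 MiB), the value chunk_used is compared against in the usage filter above.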
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2779static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2780{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2781 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2803,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2803 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2804 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2805 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2806 device->total_bytes - device->bytes_used > size_to_free ||
2807 device->is_tgtdev_for_dev_replace)
2554 continue; 2808 continue;
2555 2809
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2810 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2713,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
2713 unset_balance_control(fs_info); 2967 unset_balance_control(fs_info);
2714 ret = del_balance_item(fs_info->tree_root); 2968 ret = del_balance_item(fs_info->tree_root);
2715 BUG_ON(ret); 2969 BUG_ON(ret);
2970
2971 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2716} 2972}
2717 2973
2718void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 2974void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2728,6 +2984,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2984 u64 allowed;
2729 int mixed = 0; 2985 int mixed = 0;
2730 int ret; 2986 int ret;
2987 u64 num_devices;
2731 2988
2732 if (btrfs_fs_closing(fs_info) || 2989 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2990 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3013,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3013 }
2757 } 3014 }
2758 3015
3016 num_devices = fs_info->fs_devices->num_devices;
3017 btrfs_dev_replace_lock(&fs_info->dev_replace);
3018 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3019 BUG_ON(num_devices < 1);
3020 num_devices--;
3021 }
3022 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3023 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3024 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3025 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3026 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3027 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3028 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3029 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2884,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2884out: 3148out:
2885 if (bctl->flags & BTRFS_BALANCE_RESUME) 3149 if (bctl->flags & BTRFS_BALANCE_RESUME)
2886 __cancel_balance(fs_info); 3150 __cancel_balance(fs_info);
2887 else 3151 else {
2888 kfree(bctl); 3152 kfree(bctl);
3153 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3154 }
2889 return ret; 3155 return ret;
2890} 3156}
2891 3157
@@ -2977,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
2977 btrfs_balance_sys(leaf, item, &disk_bargs); 3243 btrfs_balance_sys(leaf, item, &disk_bargs);
2978 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3244 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2979 3245
3246 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3247
2980 mutex_lock(&fs_info->volume_mutex); 3248 mutex_lock(&fs_info->volume_mutex);
2981 mutex_lock(&fs_info->balance_mutex); 3249 mutex_lock(&fs_info->balance_mutex);
2982 3250
@@ -3080,7 +3348,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3348 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3349 u64 diff = device->total_bytes - new_size;
3082 3350
3083 if (new_size >= device->total_bytes) 3351 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3352 return -EINVAL;
3085 3353
3086 path = btrfs_alloc_path(); 3354 path = btrfs_alloc_path();
@@ -3235,6 +3503,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3503 return 0;
3236} 3504}
3237 3505
3506struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3507 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3508 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3509 { 1, 2, 1, 1, 1, 2 /* dup */ },
3510 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3511 { 1, 1, 1, 1, 1, 1 /* single */ },
3512};
3513
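The table is indexed via __get_raid_index(); a sketch of that mapping, assuming the BTRFS_RAID_* enum is declared in the same order as the rows above (raid10, raid1, dup, raid0, single):

static int __get_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;

	return BTRFS_RAID_SINGLE;	/* no profile bit set */
}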
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3514static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3515 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3516 struct map_lookup **map_ret,
@@ -3264,43 +3540,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3264 int ndevs; 3540 int ndevs;
3265 int i; 3541 int i;
3266 int j; 3542 int j;
3543 int index;
3267 3544
3268 BUG_ON(!alloc_profile_is_valid(type, 0)); 3545 BUG_ON(!alloc_profile_is_valid(type, 0));
3269 3546
3270 if (list_empty(&fs_devices->alloc_list)) 3547 if (list_empty(&fs_devices->alloc_list))
3271 return -ENOSPC; 3548 return -ENOSPC;
3272 3549
3273 sub_stripes = 1; 3550 index = __get_raid_index(type);
3274 dev_stripes = 1;
3275 devs_increment = 1;
3276 ncopies = 1;
3277 devs_max = 0; /* 0 == as many as possible */
3278 devs_min = 1;
3279 3551
3280 /* 3552 sub_stripes = btrfs_raid_array[index].sub_stripes;
3281 * define the properties of each RAID type. 3553 dev_stripes = btrfs_raid_array[index].dev_stripes;
3282 * FIXME: move this to a global table and use it in all RAID 3554 devs_max = btrfs_raid_array[index].devs_max;
3283 * calculation code 3555 devs_min = btrfs_raid_array[index].devs_min;
3284 */ 3556 devs_increment = btrfs_raid_array[index].devs_increment;
3285 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3557 ncopies = btrfs_raid_array[index].ncopies;
3286 dev_stripes = 2;
3287 ncopies = 2;
3288 devs_max = 1;
3289 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3290 devs_min = 2;
3291 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3292 devs_increment = 2;
3293 ncopies = 2;
3294 devs_max = 2;
3295 devs_min = 2;
3296 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3297 sub_stripes = 2;
3298 devs_increment = 2;
3299 ncopies = 2;
3300 devs_min = 4;
3301 } else {
3302 devs_max = 1;
3303 }
3304 3558
3305 if (type & BTRFS_BLOCK_GROUP_DATA) { 3559 if (type & BTRFS_BLOCK_GROUP_DATA) {
3306 max_stripe_size = 1024 * 1024 * 1024; 3560 max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3601,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3347 cur = cur->next; 3601 cur = cur->next;
3348 3602
3349 if (!device->writeable) { 3603 if (!device->writeable) {
3350 printk(KERN_ERR 3604 WARN(1, KERN_ERR
3351 "btrfs: read-only device in alloc_list\n"); 3605 "btrfs: read-only device in alloc_list\n");
3352 WARN_ON(1);
3353 continue; 3606 continue;
3354 } 3607 }
3355 3608
3356 if (!device->in_fs_metadata) 3609 if (!device->in_fs_metadata ||
3610 device->is_tgtdev_for_dev_replace)
3357 continue; 3611 continue;
3358 3612
3359 if (device->total_bytes > device->bytes_used) 3613 if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3636,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3382 devices_info[ndevs].total_avail = total_avail; 3636 devices_info[ndevs].total_avail = total_avail;
3383 devices_info[ndevs].dev = device; 3637 devices_info[ndevs].dev = device;
3384 ++ndevs; 3638 ++ndevs;
3639 WARN_ON(ndevs > fs_devices->rw_devices);
3385 } 3640 }
3386 3641
3387 /* 3642 /*
@@ -3740,8 +3995,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3740 } 3995 }
3741} 3996}
3742 3997
3743int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3998int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
3744{ 3999{
4000 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3745 struct extent_map *em; 4001 struct extent_map *em;
3746 struct map_lookup *map; 4002 struct map_lookup *map;
3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 4003 struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4017,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3761 else 4017 else
3762 ret = 1; 4018 ret = 1;
3763 free_extent_map(em); 4019 free_extent_map(em);
4020
4021 btrfs_dev_replace_lock(&fs_info->dev_replace);
4022 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4023 ret++;
4024 btrfs_dev_replace_unlock(&fs_info->dev_replace);
4025
3764 return ret; 4026 return ret;
3765} 4027}
3766 4028
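The increment accounts for the replace target, which holds a full copy of everything left of the cursor. Repair callers simply loop over the advertised copies (illustrative fragment, names assumed):

	int copies = btrfs_num_copies(fs_info, logical, len);
	int mirror;

	for (mirror = 1; mirror <= copies; mirror++) {
		/* for mirrored profiles, mirror == num_stripes + 1 now
		 * selects the replace target; the mapping code returns
		 * -EIO if the block is not left of the left cursor yet */
	}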
3767static int find_live_mirror(struct map_lookup *map, int first, int num, 4029static int find_live_mirror(struct btrfs_fs_info *fs_info,
3768 int optimal) 4030 struct map_lookup *map, int first, int num,
4031 int optimal, int dev_replace_is_ongoing)
3769{ 4032{
3770 int i; 4033 int i;
3771 if (map->stripes[optimal].dev->bdev) 4034 int tolerance;
3772 return optimal; 4035 struct btrfs_device *srcdev;
3773 for (i = first; i < first + num; i++) { 4036
3774 if (map->stripes[i].dev->bdev) 4037 if (dev_replace_is_ongoing &&
3775 return i; 4038 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4039 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4040 srcdev = fs_info->dev_replace.srcdev;
4041 else
4042 srcdev = NULL;
4043
4044 /*
4045 * try to avoid the drive that is the source drive for a
4046 * dev-replace procedure; only choose it if no other non-missing
4047 * mirror is available
4048 */
4049 for (tolerance = 0; tolerance < 2; tolerance++) {
4050 if (map->stripes[optimal].dev->bdev &&
4051 (tolerance || map->stripes[optimal].dev != srcdev))
4052 return optimal;
4053 for (i = first; i < first + num; i++) {
4054 if (map->stripes[i].dev->bdev &&
4055 (tolerance || map->stripes[i].dev != srcdev))
4056 return i;
4057 }
3776 } 4058 }
4059
3777 /* we couldn't find one that doesn't fail. Just return something 4060 /* we couldn't find one that doesn't fail. Just return something
3778 * and the io error handling code will clean up eventually 4061 * and the io error handling code will clean up eventually
3779 */ 4062 */
3780 return optimal; 4063 return optimal;
3781} 4064}
3782 4065
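A standalone model of the two-pass "tolerance" idiom above (hypothetical types, not btrfs code): pass 0 accepts only mirrors that are present and not the replace source; pass 1 drops the source restriction so a degraded array still returns something:

struct mirror { int present; int is_replace_src; };

static int pick_mirror(struct mirror *m, int first, int num, int optimal)
{
	int tolerance, i;

	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (m[optimal].present &&
		    (tolerance || !m[optimal].is_replace_src))
			return optimal;
		for (i = first; i < first + num; i++)
			if (m[i].present &&
			    (tolerance || !m[i].is_replace_src))
				return i;
	}
	return optimal;	/* degraded: the I/O error path cleans up */
}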
3783static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4066static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
3784 u64 logical, u64 *length, 4067 u64 logical, u64 *length,
3785 struct btrfs_bio **bbio_ret, 4068 struct btrfs_bio **bbio_ret,
3786 int mirror_num) 4069 int mirror_num)
3787{ 4070{
3788 struct extent_map *em; 4071 struct extent_map *em;
3789 struct map_lookup *map; 4072 struct map_lookup *map;
4073 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3790 struct extent_map_tree *em_tree = &map_tree->map_tree; 4074 struct extent_map_tree *em_tree = &map_tree->map_tree;
3791 u64 offset; 4075 u64 offset;
3792 u64 stripe_offset; 4076 u64 stripe_offset;
@@ -3800,6 +4084,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3800 int num_stripes; 4084 int num_stripes;
3801 int max_errors = 0; 4085 int max_errors = 0;
3802 struct btrfs_bio *bbio = NULL; 4086 struct btrfs_bio *bbio = NULL;
4087 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4088 int dev_replace_is_ongoing = 0;
4089 int num_alloc_stripes;
4090 int patch_the_first_stripe_for_dev_replace = 0;
4091 u64 physical_to_patch_in_first_stripe = 0;
3803 4092
3804 read_lock(&em_tree->lock); 4093 read_lock(&em_tree->lock);
3805 em = lookup_extent_mapping(em_tree, logical, *length); 4094 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4105,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3816 map = (struct map_lookup *)em->bdev; 4105 map = (struct map_lookup *)em->bdev;
3817 offset = logical - em->start; 4106 offset = logical - em->start;
3818 4107
3819 if (mirror_num > map->num_stripes)
3820 mirror_num = 0;
3821
3822 stripe_nr = offset; 4108 stripe_nr = offset;
3823 /* 4109 /*
3824 * stripe_nr counts the total number of stripes we have to stride 4110 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4131,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3845 if (!bbio_ret) 4131 if (!bbio_ret)
3846 goto out; 4132 goto out;
3847 4133
4134 btrfs_dev_replace_lock(dev_replace);
4135 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4136 if (!dev_replace_is_ongoing)
4137 btrfs_dev_replace_unlock(dev_replace);
4138
4139 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4140 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4141 dev_replace->tgtdev != NULL) {
4142 /*
4143 * in the dev-replace case, for the repair case (that's the only
4144 * case where the mirror is selected explicitly when
4145 * calling btrfs_map_block), blocks left of the left cursor
4146 * can also be read from the target drive.
4147 * For REQ_GET_READ_MIRRORS, the target drive is added as
4148 * the last one to the array of stripes. For READ, it also
4149 * needs to be supported using the same mirror number.
4150 * If the requested block is not left of the left cursor,
4151 * EIO is returned. This can happen because btrfs_num_copies()
4152 * returns one more in the dev-replace case.
4153 */
4154 u64 tmp_length = *length;
4155 struct btrfs_bio *tmp_bbio = NULL;
4156 int tmp_num_stripes;
4157 u64 srcdev_devid = dev_replace->srcdev->devid;
4158 int index_srcdev = 0;
4159 int found = 0;
4160 u64 physical_of_found = 0;
4161
4162 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4163 logical, &tmp_length, &tmp_bbio, 0);
4164 if (ret) {
4165 WARN_ON(tmp_bbio != NULL);
4166 goto out;
4167 }
4168
4169 tmp_num_stripes = tmp_bbio->num_stripes;
4170 if (mirror_num > tmp_num_stripes) {
4171 /*
4172 * REQ_GET_READ_MIRRORS does not contain this
4173 * mirror, which means that the requested area
4174 * is not left of the left cursor
4175 */
4176 ret = -EIO;
4177 kfree(tmp_bbio);
4178 goto out;
4179 }
4180
4181 /*
4182 * process the rest of the function using the mirror_num
4183 * of the source drive. Therefore look it up first.
4184 * At the end, patch the device pointer to that of the
4185 * target drive.
4186 */
4187 for (i = 0; i < tmp_num_stripes; i++) {
4188 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4189 /*
4190 * In case of DUP, in order to keep it
4191 * simple, only add the mirror with the
4192 * lowest physical address
4193 */
4194 if (found &&
4195 physical_of_found <=
4196 tmp_bbio->stripes[i].physical)
4197 continue;
4198 index_srcdev = i;
4199 found = 1;
4200 physical_of_found =
4201 tmp_bbio->stripes[i].physical;
4202 }
4203 }
4204
4205 if (found) {
4206 mirror_num = index_srcdev + 1;
4207 patch_the_first_stripe_for_dev_replace = 1;
4208 physical_to_patch_in_first_stripe = physical_of_found;
4209 } else {
4210 WARN_ON(1);
4211 ret = -EIO;
4212 kfree(tmp_bbio);
4213 goto out;
4214 }
4215
4216 kfree(tmp_bbio);
4217 } else if (mirror_num > map->num_stripes) {
4218 mirror_num = 0;
4219 }
4220
3848 num_stripes = 1; 4221 num_stripes = 1;
3849 stripe_index = 0; 4222 stripe_index = 0;
3850 stripe_nr_orig = stripe_nr; 4223 stripe_nr_orig = stripe_nr;
@@ -3859,19 +4232,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3859 stripe_nr_end - stripe_nr_orig); 4232 stripe_nr_end - stripe_nr_orig);
3860 stripe_index = do_div(stripe_nr, map->num_stripes); 4233 stripe_index = do_div(stripe_nr, map->num_stripes);
3861 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4234 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3862 if (rw & (REQ_WRITE | REQ_DISCARD)) 4235 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
3863 num_stripes = map->num_stripes; 4236 num_stripes = map->num_stripes;
3864 else if (mirror_num) 4237 else if (mirror_num)
3865 stripe_index = mirror_num - 1; 4238 stripe_index = mirror_num - 1;
3866 else { 4239 else {
3867 stripe_index = find_live_mirror(map, 0, 4240 stripe_index = find_live_mirror(fs_info, map, 0,
3868 map->num_stripes, 4241 map->num_stripes,
3869 current->pid % map->num_stripes); 4242 current->pid % map->num_stripes,
4243 dev_replace_is_ongoing);
3870 mirror_num = stripe_index + 1; 4244 mirror_num = stripe_index + 1;
3871 } 4245 }
3872 4246
3873 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4247 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3874 if (rw & (REQ_WRITE | REQ_DISCARD)) { 4248 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
3875 num_stripes = map->num_stripes; 4249 num_stripes = map->num_stripes;
3876 } else if (mirror_num) { 4250 } else if (mirror_num) {
3877 stripe_index = mirror_num - 1; 4251 stripe_index = mirror_num - 1;
@@ -3885,7 +4259,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3885 stripe_index = do_div(stripe_nr, factor); 4259 stripe_index = do_div(stripe_nr, factor);
3886 stripe_index *= map->sub_stripes; 4260 stripe_index *= map->sub_stripes;
3887 4261
3888 if (rw & REQ_WRITE) 4262 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
3889 num_stripes = map->sub_stripes; 4263 num_stripes = map->sub_stripes;
3890 else if (rw & REQ_DISCARD) 4264 else if (rw & REQ_DISCARD)
3891 num_stripes = min_t(u64, map->sub_stripes * 4265 num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4269,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3895 stripe_index += mirror_num - 1; 4269 stripe_index += mirror_num - 1;
3896 else { 4270 else {
3897 int old_stripe_index = stripe_index; 4271 int old_stripe_index = stripe_index;
3898 stripe_index = find_live_mirror(map, stripe_index, 4272 stripe_index = find_live_mirror(fs_info, map,
4273 stripe_index,
3899 map->sub_stripes, stripe_index + 4274 map->sub_stripes, stripe_index +
3900 current->pid % map->sub_stripes); 4275 current->pid % map->sub_stripes,
4276 dev_replace_is_ongoing);
3901 mirror_num = stripe_index - old_stripe_index + 1; 4277 mirror_num = stripe_index - old_stripe_index + 1;
3902 } 4278 }
3903 } else { 4279 } else {
@@ -3911,7 +4287,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3911 } 4287 }
3912 BUG_ON(stripe_index >= map->num_stripes); 4288 BUG_ON(stripe_index >= map->num_stripes);
3913 4289
3914 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 4290 num_alloc_stripes = num_stripes;
4291 if (dev_replace_is_ongoing) {
4292 if (rw & (REQ_WRITE | REQ_DISCARD))
4293 num_alloc_stripes <<= 1;
4294 if (rw & REQ_GET_READ_MIRRORS)
4295 num_alloc_stripes++;
4296 }
4297 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
3915 if (!bbio) { 4298 if (!bbio) {
3916 ret = -ENOMEM; 4299 ret = -ENOMEM;
3917 goto out; 4300 goto out;
@@ -3998,7 +4381,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3998 } 4381 }
3999 } 4382 }
4000 4383
4001 if (rw & REQ_WRITE) { 4384 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4002 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4385 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4003 BTRFS_BLOCK_GROUP_RAID10 | 4386 BTRFS_BLOCK_GROUP_RAID10 |
4004 BTRFS_BLOCK_GROUP_DUP)) { 4387 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4389,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
4006 } 4389 }
4007 } 4390 }
4008 4391
4392 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4393 dev_replace->tgtdev != NULL) {
4394 int index_where_to_add;
4395 u64 srcdev_devid = dev_replace->srcdev->devid;
4396
4397 /*
4398 * duplicate the write operations while the dev replace
4399 * procedure is running. Since the copying of the old disk
4400 * to the new disk takes place at run time while the
4401 * filesystem is mounted writable, the regular write
4402 * operations to the old disk have to be duplicated to go
4403 * to the new disk as well.
4404 * Note that device->missing is handled by the caller, and
4405 * that the write to the old disk is already set up in the
4406 * stripes array.
4407 */
4408 index_where_to_add = num_stripes;
4409 for (i = 0; i < num_stripes; i++) {
4410 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4411 /* write to new disk, too */
4412 struct btrfs_bio_stripe *new =
4413 bbio->stripes + index_where_to_add;
4414 struct btrfs_bio_stripe *old =
4415 bbio->stripes + i;
4416
4417 new->physical = old->physical;
4418 new->length = old->length;
4419 new->dev = dev_replace->tgtdev;
4420 index_where_to_add++;
4421 max_errors++;
4422 }
4423 }
4424 num_stripes = index_where_to_add;
4425 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4426 dev_replace->tgtdev != NULL) {
4427 u64 srcdev_devid = dev_replace->srcdev->devid;
4428 int index_srcdev = 0;
4429 int found = 0;
4430 u64 physical_of_found = 0;
4431
4432 /*
4433 * During the dev-replace procedure, the target drive can
4434 * also be used to read data in case it is needed to repair
4435 * a corrupt block elsewhere. This is possible if the
4436 * requested area is left of the left cursor. In this area,
4437 * the target drive is a full copy of the source drive.
4438 */
4439 for (i = 0; i < num_stripes; i++) {
4440 if (bbio->stripes[i].dev->devid == srcdev_devid) {
4441 /*
4442 * In case of DUP, in order to keep it
4443 * simple, only add the mirror with the
4444 * lowest physical address
4445 */
4446 if (found &&
4447 physical_of_found <=
4448 bbio->stripes[i].physical)
4449 continue;
4450 index_srcdev = i;
4451 found = 1;
4452 physical_of_found = bbio->stripes[i].physical;
4453 }
4454 }
4455 if (found) {
4456 u64 length = map->stripe_len;
4457
4458 if (physical_of_found + length <=
4459 dev_replace->cursor_left) {
4460 struct btrfs_bio_stripe *tgtdev_stripe =
4461 bbio->stripes + num_stripes;
4462
4463 tgtdev_stripe->physical = physical_of_found;
4464 tgtdev_stripe->length =
4465 bbio->stripes[index_srcdev].length;
4466 tgtdev_stripe->dev = dev_replace->tgtdev;
4467
4468 num_stripes++;
4469 }
4470 }
4471 }
4472
4009 *bbio_ret = bbio; 4473 *bbio_ret = bbio;
4010 bbio->num_stripes = num_stripes; 4474 bbio->num_stripes = num_stripes;
4011 bbio->max_errors = max_errors; 4475 bbio->max_errors = max_errors;
4012 bbio->mirror_num = mirror_num; 4476 bbio->mirror_num = mirror_num;
4477
4478 /*
4479 * this is the READ case while dev_replace_is_ongoing, where
4480 * mirror_num == num_stripes + 1 and the dev-replace target drive
4481 * is available as a mirror
4482 */
4483 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4484 WARN_ON(num_stripes > 1);
4485 bbio->stripes[0].dev = dev_replace->tgtdev;
4486 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4487 bbio->mirror_num = map->num_stripes + 1;
4488 }
4013out: 4489out:
4490 if (dev_replace_is_ongoing)
4491 btrfs_dev_replace_unlock(dev_replace);
4014 free_extent_map(em); 4492 free_extent_map(em);
4015 return ret; 4493 return ret;
4016} 4494}
4017 4495
4018int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 4496int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4019 u64 logical, u64 *length, 4497 u64 logical, u64 *length,
4020 struct btrfs_bio **bbio_ret, int mirror_num) 4498 struct btrfs_bio **bbio_ret, int mirror_num)
4021{ 4499{
4022 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 4500 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4023 mirror_num); 4501 mirror_num);
4024} 4502}
4025 4503
@@ -4238,10 +4716,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
4238 &device->work); 4716 &device->work);
4239} 4717}
4240 4718
4719static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4720 sector_t sector)
4721{
4722 struct bio_vec *prev;
4723 struct request_queue *q = bdev_get_queue(bdev);
4724 unsigned short max_sectors = queue_max_sectors(q);
4725 struct bvec_merge_data bvm = {
4726 .bi_bdev = bdev,
4727 .bi_sector = sector,
4728 .bi_rw = bio->bi_rw,
4729 };
4730
4731 if (bio->bi_vcnt == 0) {
4732 WARN_ON(1);
4733 return 1;
4734 }
4735
4736 prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4737 if ((bio->bi_size >> 9) > max_sectors)
4738 return 0;
4739
4740 if (!q->merge_bvec_fn)
4741 return 1;
4742
4743 bvm.bi_size = bio->bi_size - prev->bv_len;
4744 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4745 return 0;
4746 return 1;
4747}
4748
4749static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4750 struct bio *bio, u64 physical, int dev_nr,
4751 int rw, int async)
4752{
4753 struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4754
4755 bio->bi_private = bbio;
4756 bio->bi_private = merge_stripe_index_into_bio_private(
4757 bio->bi_private, (unsigned int)dev_nr);
4758 bio->bi_end_io = btrfs_end_bio;
4759 bio->bi_sector = physical >> 9;
4760#ifdef DEBUG
4761 {
4762 struct rcu_string *name;
4763
4764 rcu_read_lock();
4765 name = rcu_dereference(dev->name);
4766 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4767 "(%s id %llu), size=%u\n", rw,
4768 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4769 name->str, dev->devid, bio->bi_size);
4770 rcu_read_unlock();
4771 }
4772#endif
4773 bio->bi_bdev = dev->bdev;
4774 if (async)
4775 schedule_bio(root, dev, rw, bio);
4776 else
4777 btrfsic_submit_bio(rw, bio);
4778}
4779
4780static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4781 struct bio *first_bio, struct btrfs_device *dev,
4782 int dev_nr, int rw, int async)
4783{
4784 struct bio_vec *bvec = first_bio->bi_io_vec;
4785 struct bio *bio;
4786 int nr_vecs = bio_get_nr_vecs(dev->bdev);
4787 u64 physical = bbio->stripes[dev_nr].physical;
4788
4789again:
4790 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4791 if (!bio)
4792 return -ENOMEM;
4793
4794 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4795 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4796 bvec->bv_offset) < bvec->bv_len) {
4797 u64 len = bio->bi_size;
4798
4799 atomic_inc(&bbio->stripes_pending);
4800 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4801 rw, async);
4802 physical += len;
4803 goto again;
4804 }
4805 bvec++;
4806 }
4807
4808 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4809 return 0;
4810}
4811
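The again:/goto loop packs pages into child bios until bio_add_page() refuses one, submits what it has, and retries the same page in a fresh bio. A runnable userspace model of that control flow (plain C, nothing btrfs-specific):

#include <stdio.h>

static void submit(int id, int bytes)
{
	printf("submit bio %d: %d bytes\n", id, bytes);
}

int main(void)
{
	const int max_len = 2 * 4096;	/* device accepts 2 pages per bio */
	int pages = 5, id = 0, cur = 0, i = 0;

	while (i < pages) {
		if (cur + 4096 > max_len) {	/* bio_add_page() refused */
			submit(id++, cur);	/* submit_stripe_bio() */
			cur = 0;		/* btrfs_bio_alloc(): goto again */
			continue;		/* retry the same page */
		}
		cur += 4096;
		i++;
	}
	submit(id, cur);			/* final, possibly short bio */
	return 0;
}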
4812static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4813{
4814 atomic_inc(&bbio->error);
4815 if (atomic_dec_and_test(&bbio->stripes_pending)) {
4816 bio->bi_private = bbio->private;
4817 bio->bi_end_io = bbio->end_io;
4818 bio->bi_bdev = (struct block_device *)
4819 (unsigned long)bbio->mirror_num;
4820 bio->bi_sector = logical >> 9;
4821 kfree(bbio);
4822 bio_endio(bio, -EIO);
4823 }
4824}
4825
4241int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4826int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4242 int mirror_num, int async_submit) 4827 int mirror_num, int async_submit)
4243{ 4828{
4244 struct btrfs_mapping_tree *map_tree;
4245 struct btrfs_device *dev; 4829 struct btrfs_device *dev;
4246 struct bio *first_bio = bio; 4830 struct bio *first_bio = bio;
4247 u64 logical = (u64)bio->bi_sector << 9; 4831 u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4837,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4253 struct btrfs_bio *bbio = NULL; 4837 struct btrfs_bio *bbio = NULL;
4254 4838
4255 length = bio->bi_size; 4839 length = bio->bi_size;
4256 map_tree = &root->fs_info->mapping_tree;
4257 map_length = length; 4840 map_length = length;
4258 4841
4259 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4842 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4260 mirror_num); 4843 mirror_num);
4261 if (ret) /* -ENOMEM */ 4844 if (ret)
4262 return ret; 4845 return ret;
4263 4846
4264 total_devs = bbio->num_stripes; 4847 total_devs = bbio->num_stripes;
@@ -4276,52 +4859,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4276 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4859 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4277 4860
4278 while (dev_nr < total_devs) { 4861 while (dev_nr < total_devs) {
4862 dev = bbio->stripes[dev_nr].dev;
4863 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4864 bbio_error(bbio, first_bio, logical);
4865 dev_nr++;
4866 continue;
4867 }
4868
4869 /*
4870 * Check and see if we're ok with this bio based on its size
4871 * and offset with the given device.
4872 */
4873 if (!bio_size_ok(dev->bdev, first_bio,
4874 bbio->stripes[dev_nr].physical >> 9)) {
4875 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4876 dev_nr, rw, async_submit);
4877 BUG_ON(ret);
4878 dev_nr++;
4879 continue;
4880 }
4881
4279 if (dev_nr < total_devs - 1) { 4882 if (dev_nr < total_devs - 1) {
4280 bio = bio_clone(first_bio, GFP_NOFS); 4883 bio = bio_clone(first_bio, GFP_NOFS);
4281 BUG_ON(!bio); /* -ENOMEM */ 4884 BUG_ON(!bio); /* -ENOMEM */
4282 } else { 4885 } else {
4283 bio = first_bio; 4886 bio = first_bio;
4284 } 4887 }
4285 bio->bi_private = bbio; 4888
4286 bio->bi_private = merge_stripe_index_into_bio_private( 4889 submit_stripe_bio(root, bbio, bio,
4287 bio->bi_private, (unsigned int)dev_nr); 4890 bbio->stripes[dev_nr].physical, dev_nr, rw,
4288 bio->bi_end_io = btrfs_end_bio; 4891 async_submit);
4289 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4290 dev = bbio->stripes[dev_nr].dev;
4291 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4292#ifdef DEBUG
4293 struct rcu_string *name;
4294
4295 rcu_read_lock();
4296 name = rcu_dereference(dev->name);
4297 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4298 "(%s id %llu), size=%u\n", rw,
4299 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4300 name->str, dev->devid, bio->bi_size);
4301 rcu_read_unlock();
4302#endif
4303 bio->bi_bdev = dev->bdev;
4304 if (async_submit)
4305 schedule_bio(root, dev, rw, bio);
4306 else
4307 btrfsic_submit_bio(rw, bio);
4308 } else {
4309 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4310 bio->bi_sector = logical >> 9;
4311 bio_endio(bio, -EIO);
4312 }
4313 dev_nr++; 4892 dev_nr++;
4314 } 4893 }
4315 return 0; 4894 return 0;
4316} 4895}
4317 4896
4318struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4897struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4319 u8 *uuid, u8 *fsid) 4898 u8 *uuid, u8 *fsid)
4320{ 4899{
4321 struct btrfs_device *device; 4900 struct btrfs_device *device;
4322 struct btrfs_fs_devices *cur_devices; 4901 struct btrfs_fs_devices *cur_devices;
4323 4902
4324 cur_devices = root->fs_info->fs_devices; 4903 cur_devices = fs_info->fs_devices;
4325 while (cur_devices) { 4904 while (cur_devices) {
4326 if (!fsid || 4905 if (!fsid ||
4327 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4906 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4981,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4402 em->bdev = (struct block_device *)map; 4981 em->bdev = (struct block_device *)map;
4403 em->start = logical; 4982 em->start = logical;
4404 em->len = length; 4983 em->len = length;
4984 em->orig_start = 0;
4405 em->block_start = 0; 4985 em->block_start = 0;
4406 em->block_len = em->len; 4986 em->block_len = em->len;
4407 4987
@@ -4419,8 +4999,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4419 read_extent_buffer(leaf, uuid, (unsigned long) 4999 read_extent_buffer(leaf, uuid, (unsigned long)
4420 btrfs_stripe_dev_uuid_nr(chunk, i), 5000 btrfs_stripe_dev_uuid_nr(chunk, i),
4421 BTRFS_UUID_SIZE); 5001 BTRFS_UUID_SIZE);
4422 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 5002 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
4423 NULL); 5003 uuid, NULL);
4424 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 5004 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4425 kfree(map); 5005 kfree(map);
4426 free_extent_map(em); 5006 free_extent_map(em);
@@ -4461,6 +5041,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
4461 device->io_align = btrfs_device_io_align(leaf, dev_item); 5041 device->io_align = btrfs_device_io_align(leaf, dev_item);
4462 device->io_width = btrfs_device_io_width(leaf, dev_item); 5042 device->io_width = btrfs_device_io_width(leaf, dev_item);
4463 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5043 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5044 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5045 device->is_tgtdev_for_dev_replace = 0;
4464 5046
4465 ptr = (unsigned long)btrfs_device_uuid(dev_item); 5047 ptr = (unsigned long)btrfs_device_uuid(dev_item);
4466 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5048 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5120,7 @@ static int read_one_dev(struct btrfs_root *root,
4538 return ret; 5120 return ret;
4539 } 5121 }
4540 5122
4541 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 5123 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
4542 if (!device || !device->bdev) { 5124 if (!device || !device->bdev) {
4543 if (!btrfs_test_opt(root, DEGRADED)) 5125 if (!btrfs_test_opt(root, DEGRADED))
4544 return -EIO; 5126 return -EIO;
@@ -4571,7 +5153,7 @@ static int read_one_dev(struct btrfs_root *root,
4571 fill_device_from_item(leaf, dev_item, device); 5153 fill_device_from_item(leaf, dev_item, device);
4572 device->dev_root = root->fs_info->dev_root; 5154 device->dev_root = root->fs_info->dev_root;
4573 device->in_fs_metadata = 1; 5155 device->in_fs_metadata = 1;
4574 if (device->writeable) { 5156 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
4575 device->fs_devices->total_rw_bytes += device->total_bytes; 5157 device->fs_devices->total_rw_bytes += device->total_bytes;
4576 spin_lock(&root->fs_info->free_chunk_lock); 5158 spin_lock(&root->fs_info->free_chunk_lock);
4577 root->fs_info->free_chunk_space += device->total_bytes - 5159 root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5512,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4930 int i; 5512 int i;
4931 5513
4932 mutex_lock(&fs_devices->device_list_mutex); 5514 mutex_lock(&fs_devices->device_list_mutex);
4933 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 5515 dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
4934 mutex_unlock(&fs_devices->device_list_mutex); 5516 mutex_unlock(&fs_devices->device_list_mutex);
4935 5517
4936 if (!dev) { 5518 if (!dev) {
@@ -4958,3 +5540,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
4958 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 5540 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4959 return 0; 5541 return 0;
4960} 5542}
5543
5544int btrfs_scratch_superblock(struct btrfs_device *device)
5545{
5546 struct buffer_head *bh;
5547 struct btrfs_super_block *disk_super;
5548
5549 bh = btrfs_read_dev_super(device->bdev);
5550 if (!bh)
5551 return -EINVAL;
5552 disk_super = (struct btrfs_super_block *)bh->b_data;
5553
5554 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5555 set_buffer_dirty(bh);
5556 sync_dirty_buffer(bh);
5557 brelse(bh);
5558
5559 return 0;
5560}
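Zeroing just the magic suffices because device scanning rejects any superblock that fails the magic comparison; a sketch of that check, modelled on btrfs_read_dev_super()'s validation in disk-io.c (treat the exact form as an assumption):

	disk_super = (struct btrfs_super_block *)bh->b_data;
	if (strncmp((char *)&disk_super->magic, BTRFS_MAGIC,
		    sizeof(disk_super->magic))) {
		brelse(bh);	/* a scratched device is no longer btrfs */
		return NULL;
	}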
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
50 int in_fs_metadata; 50 int in_fs_metadata;
51 int missing; 51 int missing;
52 int can_discard; 52 int can_discard;
53 int is_tgtdev_for_dev_replace;
53 54
54 spinlock_t io_lock; 55 spinlock_t io_lock;
55 56
@@ -88,7 +89,7 @@ struct btrfs_device {
88 u8 uuid[BTRFS_UUID_SIZE]; 89 u8 uuid[BTRFS_UUID_SIZE];
89 90
90 /* per-device scrub information */ 91 /* per-device scrub information */
91 struct scrub_dev *scrub_device; 92 struct scrub_ctx *scrub_device;
92 93
93 struct btrfs_work work; 94 struct btrfs_work work;
94 struct rcu_head rcu; 95 struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
179 u64 total_avail; 180 u64 total_avail;
180}; 181};
181 182
183struct btrfs_raid_attr {
184 int sub_stripes; /* sub_stripes info for map */
185 int dev_stripes; /* stripes per dev */
186 int devs_max; /* max devs to use */
187 int devs_min; /* min devs needed */
188 int devs_increment; /* ndevs has to be a multiple of this */
189 int ncopies; /* how many copies of the data we have */
190};
191
182struct map_lookup { 192struct map_lookup {
183 u64 type; 193 u64 type;
184 int io_align; 194 int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
248 struct btrfs_device *device, 258 struct btrfs_device *device,
249 u64 chunk_tree, u64 chunk_objectid, 259 u64 chunk_tree, u64 chunk_objectid,
250 u64 chunk_offset, u64 start, u64 num_bytes); 260 u64 chunk_offset, u64 start, u64 num_bytes);
251int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 261int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
252 u64 logical, u64 *length, 262 u64 logical, u64 *length,
253 struct btrfs_bio **bbio_ret, int mirror_num); 263 struct btrfs_bio **bbio_ret, int mirror_num);
254int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 264int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
267int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 277int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
268 struct btrfs_fs_devices **fs_devices_ret); 278 struct btrfs_fs_devices **fs_devices_ret);
269int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 279int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
270void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); 280void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
281 struct btrfs_fs_devices *fs_devices, int step);
282int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
283 char *device_path,
284 struct btrfs_device **device);
285int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
286 struct btrfs_device **device);
271int btrfs_add_device(struct btrfs_trans_handle *trans, 287int btrfs_add_device(struct btrfs_trans_handle *trans,
272 struct btrfs_root *root, 288 struct btrfs_root *root,
273 struct btrfs_device *device); 289 struct btrfs_device *device);
274int btrfs_rm_device(struct btrfs_root *root, char *device_path); 290int btrfs_rm_device(struct btrfs_root *root, char *device_path);
275void btrfs_cleanup_fs_uuids(void); 291void btrfs_cleanup_fs_uuids(void);
276int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 292int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
277int btrfs_grow_device(struct btrfs_trans_handle *trans, 293int btrfs_grow_device(struct btrfs_trans_handle *trans,
278 struct btrfs_device *device, u64 new_size); 294 struct btrfs_device *device, u64 new_size);
279struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 295struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
280 u8 *uuid, u8 *fsid); 296 u8 *uuid, u8 *fsid);
281int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 297int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
282int btrfs_init_new_device(struct btrfs_root *root, char *path); 298int btrfs_init_new_device(struct btrfs_root *root, char *path);
299int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
300 struct btrfs_device **device_out);
283int btrfs_balance(struct btrfs_balance_control *bctl, 301int btrfs_balance(struct btrfs_balance_control *bctl,
284 struct btrfs_ioctl_balance_args *bargs); 302 struct btrfs_ioctl_balance_args *bargs);
285int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); 303int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
296int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 314int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 315int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
298 struct btrfs_fs_info *fs_info); 316 struct btrfs_fs_info *fs_info);
317void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
318 struct btrfs_device *srcdev);
319void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
320 struct btrfs_device *tgtdev);
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device);
299 324
300static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
301 int index) 326 int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
122 */ 122 */
123 if (!value) 123 if (!value)
124 goto out; 124 goto out;
125 } else {
126 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
127 name, name_len, 0);
128 if (IS_ERR(di)) {
129 ret = PTR_ERR(di);
130 goto out;
131 }
132 if (!di && !value)
133 goto out;
134 btrfs_release_path(path);
125 } 135 }
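The added branch makes deleting an attribute that was never set a clean no-op instead of an error; e.g. a removal call like the following (hypothetical name, flags == 0) now returns 0 when nothing matches:

	/* value == NULL means "remove"; after the lookup above this is
	 * a no-op rather than a failure when the xattr is absent */
	ret = __btrfs_setxattr(trans, inode, "user.foo", NULL, 0, 0);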
126 136
127again: 137again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
198 208
199 inode_inc_iversion(inode); 209 inode_inc_iversion(inode);
200 inode->i_ctime = CURRENT_TIME; 210 inode->i_ctime = CURRENT_TIME;
211 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
201 ret = btrfs_update_inode(trans, root, inode); 212 ret = btrfs_update_inode(trans, root, inode);
202 BUG_ON(ret); 213 BUG_ON(ret);
203out: 214out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
265 276
266 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 277 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
267 if (verify_dir_item(root, leaf, di)) 278 if (verify_dir_item(root, leaf, di))
268 continue; 279 goto next;
269 280
270 name_len = btrfs_dir_name_len(leaf, di); 281 name_len = btrfs_dir_name_len(leaf, di);
271 total_size += name_len + 1; 282 total_size += name_len + 1;